In [4]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

model_name = "manishiitg/resume-ner"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

nlp = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

text = """
B.Tech in Computer Science at Amrita Vishwa Vidyapeetham (2021 – 2025)
Flutter Developer Intern at Go Eleventh Mile from Nov 2024 to Present
"""

ner_results = nlp(text)
for res in ner_results:
    print(f"{res['word']} → {res['entity_group']}")


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


b → EducationDegree
tech → EducationDegree
amrita vishwa vidyapeetham → ORG
( 2021 – 2025 → DATE
flutter developer intern → Designation
nov 2024 → DATE
present → DATE


In [5]:
import os
import re
import docx2txt
import fitz  # PyMuPDF
from collections import defaultdict
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import torch

# ------------ Load Resume NER Model ------------ #
model_name = "manishiitg/resume-ner"
device = 0 if torch.cuda.is_available() else -1  # Use GPU if available

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple", device=device)

# ------------ Text Extraction ------------ #
def extract_text(file_path):
    ext = os.path.splitext(file_path)[-1].lower()
    if ext == ".pdf":
        doc = fitz.open(file_path)
        return "\n".join([page.get_text() for page in doc])
    elif ext == ".docx":
        return docx2txt.process(file_path)
    else:
        raise ValueError("Unsupported file type. Please use PDF or DOCX.")

# ------------ Basic Info (Regex) ------------ #
def extract_basic_info(text):
    name_match = re.search(r'(?i)^([A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,2})', text)
    email_match = re.search(r'[\w\.-]+@[\w\.-]+\.\w+', text)
    phone_match = re.search(r'(\+?\d[\d\s\-\(\)]{8,})', text)

    return {
        "Name": name_match.group(1).strip() if name_match else None,
        "Email": email_match.group(0).strip() if email_match else None,
        "Phone Number": phone_match.group(0).strip() if phone_match else None
    }

def clean_output(obj):
    return {k: v.strip().title() if isinstance(v, str) and v else v for k, v in obj.items()}

# ------------ Structured Resume Info ------------ #
def extract_resume_info_structured(text):
    sentences = re.split(r'(?<=[\.\?\!\n])\s+', text)
    education = []
    experience = []

    for sent in sentences:
        ents = ner_pipeline(sent)
        if not ents:
            continue

        tags = defaultdict(str)
        dates = []

        for ent in ents:
            word = re.sub(r"\s+", " ", ent['word'].replace("##", "")).strip()
            entity_type = ent['entity_group']
            tags[entity_type] += word + " "
            if entity_type == 'DATE':
                dates.append(word)

        for key in tags:
            tags[key] = tags[key].strip()

        date = None
        if len(dates) >= 2:
            date = f"{dates[0]} - {dates[1]}"
        elif len(dates) == 1:
            date = dates[0]

        # === Education ===
        is_edu = (
            'EducationDegree' in tags or
            re.search(r'\b(b\.?tech|m\.?tech|bachelor|master|engineering|intermediate|school|college|university)\b', sent, re.I)
        )
        if is_edu and ('ORG' in tags or 'INSTITUTE' in tags):
            degree = tags.get('EducationDegree')
            university = tags.get('ORG') or tags.get('INSTITUTE')

            # Fallback
            if not degree:
                m = re.search(r'(b\.?tech|m\.?tech|bachelor|master|engineering)', sent, re.I)
                degree = m.group(0) if m else None

            if degree and university:
                education.append(clean_output({
                    "degree": degree,
                    "university": university,
                    "date": date
                }))

        # === Work Experience ===
        if 'Designation' in tags:
            role = tags.get('Designation')
            company = tags.get('ORG') or tags.get('COMPANY')

            # Fallback for freelance
            if not company:
                match = re.search(r'at\s+([A-Z][a-zA-Z]+)', sent)
                company = match.group(1) if match else None

            if role and company:
                experience.append(clean_output({
                    "designation": role,
                    "organization": company,
                    "date": date
                }))

    return {
        "Education": education,
        "Work Experience": experience
    }

# ------------ Main ------------ #
# if __name__ == "__main__":
#     resume_path = "../test-resume/Manoj Resume.pdf"  # Update path

#     if not os.path.exists(resume_path):
#         print(f"Error: File '{resume_path}' not found.")
#     else:
#         resume_text = extract_text(resume_path)
#         parsed_info = extract_resume_info_structured(resume_text)

#         print("\nEducation:")
#         for edu in parsed_info["Education"]:
#             print("-", edu)

#         print("\nWork Experience:")
#         for exp in parsed_info["Work Experience"]:
#             print("-", exp)
def process_resumes_in_folder(folder_path: str):
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.lower().endswith(('.pdf', '.docx')):
            continue

        file_path = os.path.join(folder_path, file_name)
        print(f"\n📄 Processing: {file_name}")

        try:
            text = extract_text(file_path)
            parsed_info = extract_resume_info_structured(text)
            
            print("\nEducation:")
            for edu in parsed_info["Education"]:
                print("-", edu)
            
            print("\nWork Experience:")
            for work in parsed_info["Work Experience"]:
                print("-", work)
            # basic_info = extract_basic_info(text)
            # structured = extract_resume_info_structured(text)

            # print("\nBASIC INFO:")
            # print(basic_info)

            # print("\nEducation:")
            # for edu in structured["Education"]:
            #     print("-", edu)

            # print("\nWork Experience:")
            # for work in structured["Work Experience"]:
            #     print("-", work)

        except Exception as e:
            print(f"⚠️ Error processing {file_name}: {e}")
            
process_resumes_in_folder("../test-resume")



📄 Processing: Achyuth_Resume.docx

Education:

Work Experience:

📄 Processing: K Nishanth Java.docx

Education:
- {'degree': 'Bachelor Of Technology', 'university': 'Indian Institute Of Technology, Kharagpur', 'date': '2013'}

Work Experience:

📄 Processing: Manoj Resume.pdf

Education:
- {'degree': 'Engineering', 'university': 'Sri Chaitanya College Of Education K K R S Go Ham School', 'date': '2021 – Present - 2019 – 2021'}

Work Experience:
- {'designation': 'Flutter Developer Intern', 'organization': 'Amrita Mind And Brain Center', 'date': 'November 2024 - Present'}
- {'designation': 'Hardware Security Researcher', 'organization': 'Redant', 'date': 'July 2023 - October 202'}
- {'designation': 'Flutter Developer', 'organization': 'Ostello Ai', 'date': 'June 2023 - September 202'}

📄 Processing: Siddharth_Reddy_Resume.pdf

Education:
- {'degree': 'B Tech', 'university': 'Symbosystems', 'date': '2021 – 2025 - 2019 – 2021'}

Work Experience:
- {'designation': 'Ai Full Stack Developer'

In [6]:
import os
import re
import docx2txt
import fitz  # PyMuPDF
from collections import defaultdict
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# ------------ Load Resume NER Model ------------ #

model_name = "manishiitg/resume-ner"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# ------------ Text Extraction ------------ #

def extract_text(file_path):
    ext = os.path.splitext(file_path)[-1].lower()
    if ext == ".pdf":
        doc = fitz.open(file_path)
        return "\n".join([page.get_text() for page in doc])
    elif ext == ".docx":
        return docx2txt.process(file_path)
    else:
        raise ValueError("Unsupported file type. Please use PDF or DOCX.")

# ------------ Basic Info (Regex) ------------ #

def extract_basic_info(text):
    name_match = re.search(r'(?i)^([A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,2})', text)
    email_match = re.search(r'[\w\.-]+@[\w\.-]+\.\w+', text)
    phone_match = re.search(r'(\+?\d[\d\s\-\(\)]{8,})', text)

    return {
        "Name": name_match.group(1).strip() if name_match else None,
        "Email": email_match.group(0).strip() if email_match else None,
        "Phone Number": phone_match.group(0).strip() if phone_match else None
    }

def clean_output(obj):
    return {k: v.strip().title() if isinstance(v, str) and v else v for k, v in obj.items()}

# ------------ Helper: Fallback Date Extraction ------------ #

def extract_dates_from_text(text):
    matches = re.findall(r'(?i)(\b(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[a-z]*\.?\s*\d{4})', text)
    if len(matches) >= 2:
        return f"{matches[0]} - {matches[1]}"
    elif len(matches) == 1:
        return matches[0]
    return None

# ------------ Structured Resume Info ------------ #

def extract_resume_info_structured(text):
    sentences = re.split(r'(?<=[\.\?\!\n])\s+', text)

    education = []
    experience = []

    last_date = None
    last_org = None

    for sent in sentences:
        ents = ner_pipeline(sent)
        if not ents:
            continue

        tags = defaultdict(str)
        date_tokens = []

        for ent in ents:
            clean_word = re.sub(r"\s+", " ", ent['word'].replace("##", "")).strip()
            entity_group = ent['entity_group']

            if entity_group == 'DATE':
                date_tokens.append(clean_word)
            else:
                tags[entity_group] += clean_word + " "

        # Clean final tag strings
        for key in tags:
            tags[key] = tags[key].strip()

        # Date handling (NER or regex fallback)
        date = None
        if len(date_tokens) >= 2:
            date = f"{date_tokens[0]} - {date_tokens[1]}"
        elif len(date_tokens) == 1:
            date = date_tokens[0]
        else:
            date = extract_dates_from_text(sent)

        # Update last known values
        if date:
            last_date = date
        if 'ORG' in tags or 'INSTITUTE' in tags or 'COMPANY' in tags:
            last_org = tags.get('ORG') or tags.get('INSTITUTE') or tags.get('COMPANY')

        # === Education ===
        is_edu = (
            'EducationDegree' in tags or
            re.search(r'\b(b\.tech|m\.tech|bachelor|master|engineering|intermediate|school|college|university)\b', sent, re.IGNORECASE)
        )

        if is_edu:
            degree = tags.get('EducationDegree')
            org = tags.get('ORG') or tags.get('INSTITUTE') or last_org
            if degree or org:
                education.append(clean_output({
                    "degree": degree,
                    "university": org,
                    "date": date or last_date
                }))

        # === Work Experience ===
        if 'Designation' in tags:
            designation = tags['Designation']
            company = tags.get('ORG') or tags.get('COMPANY') or last_org
            if designation or company:
                experience.append(clean_output({
                    "designation": designation,
                    "organization": company,
                    "date": date or last_date
                }))

    return {
        "Education": education,
        "Work Experience": experience
    }

# ------------ Main ------------ #

# if __name__ == "__main__":
#     resume_path = "../test-resume/Siddharth_Reddy_Resume.pdf"  # Update to your path

#     if not os.path.exists(resume_path):
#         print(f"Error: File '{resume_path}' not found.")
#     else:
#         resume_text = extract_text(resume_path)
#         basic_info = extract_basic_info(resume_text)
#         parsed = extract_resume_info_structured(resume_text)

#         print("\nBASIC INFO:")
#         print(basic_info)

#         for section, items in parsed.items():
#             print(f"\n{section}:")
#             for item in items:
#                 print(f"- {item}")

# ------------ Batch Folder Parser ------------ #
def process_resumes_in_folder(folder_path: str):
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.lower().endswith(('.pdf', '.docx')):
            continue

        file_path = os.path.join(folder_path, file_name)
        print(f"\n📄 Processing: {file_name}")

        try:
            text = extract_text(file_path)
            basic_info = extract_basic_info(text)
            structured = extract_resume_info_structured(text)

            print("\nBASIC INFO:")
            print(basic_info)

            print("\nEducation:")
            for edu in structured["Education"]:
                print("-", edu)

            print("\nWork Experience:")
            for work in structured["Work Experience"]:
                print("-", work)

        except Exception as e:
            print(f"⚠️ Error processing {file_name}: {e}")
            
process_resumes_in_folder("../test-resume")


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.



📄 Processing: Achyuth_Resume.docx

BASIC INFO:
{'Name': 'Achyuth Rao', 'Email': 'rsda.sada-8@gmail.com', 'Phone Number': '+1 (124)-456-1232'}

Education:

Work Experience:
- {'designation': '/ Cloud Engineer', 'organization': None, 'date': None}
- {'designation': 'Build Engineer', 'organization': None, 'date': None}
- {'designation': 'Engineer', 'organization': None, 'date': 'Mar 2021 - Present'}
- {'designation': 'Engineer', 'organization': 'Assure Care Llc', 'date': 'Aug 2020 – Mar2021'}
- {'designation': 'Engineer', 'organization': 'Assure Care Llc', 'date': 'Jan 2019 - Nov 2020'}
- {'designation': 'Build Engineer', 'organization': 'Centene Corporation', 'date': 'Jan 2018 – July 2019'}
- {'designation': 'Linux System Administrator', 'organization': 'Centene Corporation', 'date': 'May 2016 - Jan 2018'}

📄 Processing: K Nishanth Java.docx

BASIC INFO:
{'Name': None, 'Email': 'asdasfadaf@gmail.com', 'Phone Number': '424) 324-3454'}

Education:
- {'degree': 'Bachelor Of Technology', 'u

In [3]:
import os
import re
import docx2txt
import fitz  # PyMuPDF
from collections import defaultdict
from typing import List, Dict
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# ------------ Load Resume NER Model ------------ #
model_name = "manishiitg/resume-ner"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

device = 0 if os.environ.get("CUDA_VISIBLE_DEVICES", None) else -1
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple", device=device)

# ------------ Text Extraction ------------ #
def extract_text(file_path: str) -> str:
    ext = os.path.splitext(file_path)[-1].lower()
    if ext == ".pdf":
        doc = fitz.open(file_path)
        return "\n".join([page.get_text() for page in doc])
    elif ext == ".docx":
        return docx2txt.process(file_path)
    else:
        raise ValueError("Unsupported file type. Use PDF or DOCX.")

# ------------ Basic Info (Regex) ------------ #
def extract_basic_info(text: str) -> Dict[str, str]:
    name_match = re.search(
        r'^(?!.*(?:experience|institute|education|skills))([A-Z][a-zA-Z]*(?:\s+[A-Z][a-zA-Z]*){1,3})',
        text, re.MULTILINE
    )
    email_match = re.search(r'[\w\.-]+@[\w\.-]+\.\w+', text)
    phone_match = re.search(r'(\+?\d[\d\s\-\(\)]{8,}\d)', text)

    phone = re.sub(r'[^\d+]', '', phone_match.group(0)) if phone_match else None

    return {
        "Name": name_match.group(1).strip() if name_match else None,
        "Email": email_match.group(0).strip() if email_match else None,
        "Phone Number": phone
    }

def clean_output(obj: Dict[str, str]) -> Dict[str, str]:
    return {k: v.strip().title() if isinstance(v, str) and v else v for k, v in obj.items()}

def deduplicate(entries: List[Dict[str, str]]) -> List[Dict[str, str]]:
    seen = set()
    unique = []
    for entry in entries:
        key = tuple(sorted(entry.items()))
        if key not in seen:
            seen.add(key)
            unique.append(entry)
    return unique

# ------------ Structured Resume Info ------------ #
def extract_resume_info_structured(text: str) -> Dict[str, List[Dict[str, str]]]:
    sentences = re.split(r'(?<=[\.\?\!\n])\s+', text)
    education = []
    experience = []

    for sent in sentences:
        ents = ner_pipeline(sent)
        if not ents:
            continue

        tags = defaultdict(str)
        date_tokens = []

        for ent in ents:
            clean_word = re.sub(r"\s+", " ", ent['word'].replace("##", "")).strip()
            tags[ent['entity_group']] += clean_word + " "
            if ent['entity_group'] == 'DATE':
                date_tokens.append(clean_word)

        for key in tags:
            tags[key] = tags[key].strip()

        # Date cleanup
        date = None
        if len(date_tokens) >= 2:
            date = f"{date_tokens[0]} - {date_tokens[1]}"
        elif len(date_tokens) == 1:
            date = date_tokens[0]

        if date and '-' in date:
            parts = date.split('-')
            if len(parts) > 2:
                date = f"{parts[0].strip()} - {parts[1].strip()}"

        # === Education ===
        is_edu_sentence = (
            'EducationDegree' in tags or
            re.search(r'\b(b\.tech|m\.tech|bachelor|master|engineering|school|college|university)\b', sent, re.IGNORECASE)
        )

        if is_edu_sentence and ('ORG' in tags or 'INSTITUTE' in tags):
            degree = tags.get('EducationDegree', None)
            university = tags.get('ORG') or tags.get('INSTITUTE', None)

            if not degree:
                match = re.search(r'(b\.tech|m\.tech|bachelor|master|engineering|b\.e\.|m\.e\.|computer science|cse)', sent, re.IGNORECASE)
                degree = match.group(0).strip() if match else None

            if university and not re.search(r'(school|college|university|institute|academy|centre)', university, re.IGNORECASE):
                university = None

            education.append(clean_output({
                "degree": degree,
                "university": university,
                "date": date
            }))

        # === Work Experience ===
        if 'Designation' in tags:
            role = tags.get('Designation')
            company = tags.get('ORG') or tags.get('COMPANY') or tags.get('INSTITUTE')

            experience.append(clean_output({
                "designation": role,
                "organization": company if company else None,
                "date": date
            }))

    return {
        "Education": deduplicate(education),
        "Work Experience": deduplicate(experience)
    }

# ------------ Batch Folder Parser ------------ #
def process_resumes_in_folder(folder_path: str):
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.lower().endswith(('.pdf', '.docx')):
            continue

        file_path = os.path.join(folder_path, file_name)
        print(f"\n📄 Processing: {file_name}")

        try:
            text = extract_text(file_path)
            basic_info = extract_basic_info(text)
            structured = extract_resume_info_structured(text)

            print("\nBASIC INFO:")
            print(basic_info)

            print("\nEducation:")
            for edu in structured["Education"]:
                print("-", edu)

            print("\nWork Experience:")
            for work in structured["Work Experience"]:
                print("-", work)

        except Exception as e:
            print(f"⚠️ Error processing {file_name}: {e}")

# ------------ Run Parser ------------ #
# Replace with your resume folder path
process_resumes_in_folder("../test-resume")



📄 Processing: Achyuth_Resume.docx

BASIC INFO:
{'Name': 'Achyuth Rao P                                                                     \n\nSr', 'Email': 'rsda.sada-8@gmail.com', 'Phone Number': '+11244561232'}

Education:

Work Experience:
- {'designation': '/ Cloud Engineer', 'organization': None, 'date': None}
- {'designation': 'Build Engineer', 'organization': None, 'date': None}
- {'designation': 'Engineer', 'organization': None, 'date': 'Mar 2021 - Present'}
- {'designation': 'Engineer', 'organization': None, 'date': 'Aug 2020 – Mar2021'}
- {'designation': 'Engineer', 'organization': None, 'date': 'Jan 2019 - Nov 2020'}
- {'designation': 'Build Engineer', 'organization': None, 'date': 'Jan 2018 – July 2019'}
- {'designation': 'Linux System Administrator', 'organization': None, 'date': 'May 2016 - Jan 2018'}

📄 Processing: K Nishanth Java.docx

BASIC INFO:
{'Name': 'Nishanth K', 'Email': 'asdasfadaf@gmail.com', 'Phone Number': '4243243454'}

Education:
- {'degree': 'Bachelor O