In [4]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

model_name = "manishiitg/resume-ner"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

nlp = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

text = """
B.Tech in Computer Science at Amrita Vishwa Vidyapeetham (2021 – 2025)
Flutter Developer Intern at Go Eleventh Mile from Nov 2024 to Present
"""

ner_results = nlp(text)
for res in ner_results:
    print(f"{res['word']} → {res['entity_group']}")


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


b → EducationDegree
tech → EducationDegree
amrita vishwa vidyapeetham → ORG
( 2021 – 2025 → DATE
flutter developer intern → Designation
nov 2024 → DATE
present → DATE


In [59]:
import os
import re
import docx2txt
import fitz  # PyMuPDF
from collections import defaultdict
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import torch

# ------------ Load Resume NER Model ------------ #
model_name = "manishiitg/resume-ner"
device = 0 if torch.cuda.is_available() else -1  # Use GPU if available

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple", device=device)

# ------------ Text Extraction ------------ #
def extract_text(file_path):
    ext = os.path.splitext(file_path)[-1].lower()
    if ext == ".pdf":
        doc = fitz.open(file_path)
        return "\n".join([page.get_text() for page in doc])
    elif ext == ".docx":
        return docx2txt.process(file_path)
    else:
        raise ValueError("Unsupported file type. Please use PDF or DOCX.")

# ------------ Basic Info (Regex) ------------ #
def extract_basic_info(text):
    name_match = re.search(r'(?i)^([A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,2})', text)
    email_match = re.search(r'[\w\.-]+@[\w\.-]+\.\w+', text)
    phone_match = re.search(r'(\+?\d[\d\s\-\(\)]{8,})', text)

    return {
        "Name": name_match.group(1).strip() if name_match else None,
        "Email": email_match.group(0).strip() if email_match else None,
        "Phone Number": phone_match.group(0).strip() if phone_match else None
    }

def clean_output(obj):
    return {k: v.strip().title() if isinstance(v, str) and v else v for k, v in obj.items()}

# ------------ Structured Resume Info ------------ #
def extract_resume_info_structured(text):
    sentences = re.split(r'(?<=[\.\?\!\n])\s+', text)
    education = []
    experience = []

    for sent in sentences:
        ents = ner_pipeline(sent)
        if not ents:
            continue

        tags = defaultdict(str)
        dates = []

        for ent in ents:
            word = re.sub(r"\s+", " ", ent['word'].replace("##", "")).strip()
            entity_type = ent['entity_group']
            tags[entity_type] += word + " "
            if entity_type == 'DATE':
                dates.append(word)

        for key in tags:
            tags[key] = tags[key].strip()

        date = None
        if len(dates) >= 2:
            date = f"{dates[0]} - {dates[1]}"
        elif len(dates) == 1:
            date = dates[0]

        # === Education ===
        is_edu = (
            'EducationDegree' in tags or
            re.search(r'\b(b\.?tech|m\.?tech|bachelor|master|engineering|intermediate|school|college|university)\b', sent, re.I)
        )
        if is_edu and ('ORG' in tags or 'INSTITUTE' in tags):
            degree = tags.get('EducationDegree')
            university = tags.get('ORG') or tags.get('INSTITUTE')

            # Fallback
            if not degree:
                m = re.search(r'(b\.?tech|m\.?tech|bachelor|master|engineering)', sent, re.I)
                degree = m.group(0) if m else None

            if degree and university:
                education.append(clean_output({
                    "degree": degree,
                    "university": university,
                    "date": date
                }))

        # === Work Experience ===
        if 'Designation' in tags:
            role = tags.get('Designation')
            company = tags.get('ORG') or tags.get('COMPANY')

            # Fallback for freelance
            if not company:
                match = re.search(r'at\s+([A-Z][a-zA-Z]+)', sent)
                company = match.group(1) if match else None

            if role and company:
                experience.append(clean_output({
                    "designation": role,
                    "organization": company,
                    "date": date
                }))

    return {
        "Education": education,
        "Work Experience": experience
    }

# ------------ Main ------------ #
if __name__ == "__main__":
    resume_path = "../test-resume/Manoj Resume.pdf"  # Update path

    if not os.path.exists(resume_path):
        print(f"Error: File '{resume_path}' not found.")
    else:
        resume_text = extract_text(resume_path)
        parsed_info = extract_resume_info_structured(resume_text)

        print("\nEducation:")
        for edu in parsed_info["Education"]:
            print("-", edu)

        print("\nWork Experience:")
        for exp in parsed_info["Work Experience"]:
            print("-", exp)



Education:
- {'degree': 'Engineering', 'university': 'Sri Chaitanya College Of Education K K R S Go Ham School', 'date': '2021 – Present - 2019 – 2021'}

Work Experience:
- {'designation': 'Flutter Developer Intern', 'organization': 'Amrita Mind And Brain Center', 'date': 'November 2024 - Present'}
- {'designation': 'Hardware Security Researcher', 'organization': 'Redant', 'date': 'July 2023 - October 202'}
- {'designation': 'Flutter Developer', 'organization': 'Ostello Ai', 'date': 'June 2023 - September 202'}


In [None]:
import os
import re
import docx2txt
import fitz  # PyMuPDF
from collections import defaultdict
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# ------------ Load Resume NER Model ------------ #

model_name = "manishiitg/resume-ner"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# ------------ Text Extraction ------------ #

def extract_text(file_path):
    ext = os.path.splitext(file_path)[-1].lower()
    if ext == ".pdf":
        doc = fitz.open(file_path)
        return "\n".join([page.get_text() for page in doc])
    elif ext == ".docx":
        return docx2txt.process(file_path)
    else:
        raise ValueError("Unsupported file type. Please use PDF or DOCX.")

# ------------ Basic Info (Regex) ------------ #

def extract_basic_info(text):
    name_match = re.search(r'(?i)^([A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,2})', text)
    email_match = re.search(r'[\w\.-]+@[\w\.-]+\.\w+', text)
    phone_match = re.search(r'(\+?\d[\d\s\-\(\)]{8,})', text)

    return {
        "Name": name_match.group(1).strip() if name_match else None,
        "Email": email_match.group(0).strip() if email_match else None,
        "Phone Number": phone_match.group(0).strip() if phone_match else None
    }

def clean_output(obj):
    return {k: v.strip().title() if isinstance(v, str) and v else v for k, v in obj.items()}

# ------------ Helper: Fallback Date Extraction ------------ #

def extract_dates_from_text(text):
    matches = re.findall(r'(?i)(\b(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[a-z]*\.?\s*\d{4})', text)
    if len(matches) >= 2:
        return f"{matches[0]} - {matches[1]}"
    elif len(matches) == 1:
        return matches[0]
    return None

# ------------ Structured Resume Info ------------ #

def extract_resume_info_structured(text):
    sentences = re.split(r'(?<=[\.\?\!\n])\s+', text)

    education = []
    experience = []

    last_date = None
    last_org = None

    for sent in sentences:
        ents = ner_pipeline(sent)
        if not ents:
            continue

        tags = defaultdict(str)
        date_tokens = []

        for ent in ents:
            clean_word = re.sub(r"\s+", " ", ent['word'].replace("##", "")).strip()
            entity_group = ent['entity_group']

            if entity_group == 'DATE':
                date_tokens.append(clean_word)
            else:
                tags[entity_group] += clean_word + " "

        # Clean final tag strings
        for key in tags:
            tags[key] = tags[key].strip()

        # Date handling (NER or regex fallback)
        date = None
        if len(date_tokens) >= 2:
            date = f"{date_tokens[0]} - {date_tokens[1]}"
        elif len(date_tokens) == 1:
            date = date_tokens[0]
        else:
            date = extract_dates_from_text(sent)

        # Update last known values
        if date:
            last_date = date
        if 'ORG' in tags or 'INSTITUTE' in tags or 'COMPANY' in tags:
            last_org = tags.get('ORG') or tags.get('INSTITUTE') or tags.get('COMPANY')

        # === Education ===
        is_edu = (
            'EducationDegree' in tags or
            re.search(r'\b(b\.tech|m\.tech|bachelor|master|engineering|intermediate|school|college|university)\b', sent, re.IGNORECASE)
        )

        if is_edu:
            degree = tags.get('EducationDegree')
            org = tags.get('ORG') or tags.get('INSTITUTE') or last_org
            if degree or org:
                education.append(clean_output({
                    "degree": degree,
                    "university": org,
                    "date": date or last_date
                }))

        # === Work Experience ===
        if 'Designation' in tags:
            designation = tags['Designation']
            company = tags.get('ORG') or tags.get('COMPANY') or last_org
            if designation or company:
                experience.append(clean_output({
                    "designation": designation,
                    "organization": company,
                    "date": date or last_date
                }))

    return {
        "Education": education,
        "Work Experience": experience
    }

# ------------ Main ------------ #

if __name__ == "__main__":
    resume_path = "../test-resume/Siddharth_Reddy_Resume.pdf"  # Update to your path

    if not os.path.exists(resume_path):
        print(f"Error: File '{resume_path}' not found.")
    else:
        resume_text = extract_text(resume_path)
        basic_info = extract_basic_info(resume_text)
        parsed = extract_resume_info_structured(resume_text)

        print("\nBASIC INFO:")
        print(basic_info)

        for section, items in parsed.items():
            print(f"\n{section}:")
            for item in items:
                print(f"- {item}")


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.



BASIC INFO:
{'Name': 'Manoj Parasuram \nSadanala', 'Email': 'manojparasuram.sadhanala@gmail.com', 'Phone Number': '9000830070'}

Education:
- {'degree': None, 'university': 'Sri Chaitanya College Of Education K K R S Go Ham School', 'date': '2021 – Present - 2019 – 2021'}
- {'degree': None, 'university': 'Amrita Mind And Brain Center', 'date': 'December 2021 – May 2024'}

Work Experience:
- {'designation': 'Flutter Developer Intern', 'organization': 'Amrita Mind And Brain Center', 'date': 'November 2024 - Present'}
- {'designation': 'Hardware Security Researcher', 'organization': 'Redant', 'date': 'July 2023 - October 202'}
- {'designation': 'Flutter Developer', 'organization': 'Ostello Ai', 'date': 'June 2023 - September 202'}
