In [43]:
from transformers import pipeline

# Load NER pipeline
ner_pipeline = pipeline(
    "ner",
    model="dslim/bert-base-NER",
    aggregation_strategy="simple"  # important for clean entities
)


Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use mps:0


In [44]:
resume_text = """
John Doe
Email: john.doe@email.com

Education:
Bachelor of Technology in Computer Science
Indian Institute of Technology Bombay

Experience:
Software Engineer at Google
Intern at Microsoft
"""


In [45]:
entities = ner_pipeline(resume_text)
entities


[{'entity_group': 'PER',
  'score': 0.99334896,
  'word': 'John Do',
  'start': 1,
  'end': 8},
 {'entity_group': 'MISC',
  'score': 0.5686403,
  'word': 'Technology',
  'start': 60,
  'end': 70},
 {'entity_group': 'MISC',
  'score': 0.85484236,
  'word': 'Computer Science',
  'start': 74,
  'end': 90},
 {'entity_group': 'ORG',
  'score': 0.9976185,
  'word': 'Indian Institute of Technology',
  'start': 91,
  'end': 121},
 {'entity_group': 'ORG',
  'score': 0.9835171,
  'word': 'Google',
  'start': 163,
  'end': 169},
 {'entity_group': 'ORG',
  'score': 0.9980179,
  'word': 'Microsoft',
  'start': 180,
  'end': 189}]

In [46]:
def extract_resume_fields(entities):
    name = None
    universities = set()
    companies = set()

    for entity in entities:
        label = entity["entity_group"]
        text = entity["word"]

        if label == "PER" and name is None:
            name = text

        if label == "ORG":
            # Heuristic: Universities usually contain keywords
            if any(keyword in text.lower() for keyword in ["university", "institute", "college"]):
                universities.add(text)
            else:
                companies.add(text)

    return {
        "Name": name,
        "University": list(universities),
        "Company": list(companies)
    }


In [47]:
parsed_data = extract_resume_fields(entities)
parsed_data


{'Name': 'John Do',
 'University': ['Indian Institute of Technology'],
 'Company': ['Microsoft', 'Google']}