# Resume Data - Entity Extraction and Graph Loading

In [1]:
from dotenv import load_dotenv
import os
# Load the key/value pairs from the Streamlit secrets file into the process
# environment, overriding any values that are already set.
# NOTE(review): python-dotenv is being pointed at a .toml file; this presumably
# relies on the file containing only flat KEY="value" lines — confirm.
load_dotenv(dotenv_path='.streamlit/secrets.toml', override=True)

# Neo4j connection settings (None when the variable is not defined)
NEO4J_URI = os.environ.get('RESUME_NEO4J_URI')
NEO4J_USERNAME = os.environ.get('RESUME_NEO4J_USERNAME')
NEO4J_PASSWORD = os.environ.get('RESUME_NEO4J_PASSWORD')

# OpenAI API key (None when the variable is not defined)
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

## GenAI Setup

In [2]:
from langchain_openai import ChatOpenAI

# Chat model used for extraction; temperature 0 to keep sampling randomness
# out of the extracted values.
llm = ChatOpenAI(
    temperature=0,
    model_name="gpt-4-0125-preview",
)

In [3]:
from langchain_core.prompts import ChatPromptTemplate

# Prompt for the extraction chain: a fixed system instruction followed by the
# resume text, which is substituted into the single `{text}` placeholder.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            # Adjacent string literals are concatenated with no separator, so
            # every fragment except the last must end with a space.
            "You are an expert extraction algorithm for resumes from job aspirants. "
            "Only extract relevant information from the text. "
            "If you do not know the value of an attribute asked to extract, "
            "return an empty string, '', for the attribute's value. "
            "Do not create fictitious data or impute missing values."
        ),
        ("human", "{text}"),
    ]
)

In [4]:
import datetime
from typing import List
from langchain_core.pydantic_v1 import BaseModel, Field

# Extraction schema for one job position listed on a resume.
# All fields are plain strings (dates included); the Field descriptions are
# part of the schema that `Person` exposes to `llm.with_structured_output`.
# Documented with `#` comments rather than a class docstring so the generated
# schema itself is left untouched.
class Position(BaseModel):
    id: str = Field(description="Unique position id")
    title: str = Field(description="The job title")
    location: str = Field(description="Location of position")
    startDate: str = Field(description="Start date of position")
    endDate: str = Field(description="End date of position")
    description: str = Field(description="A crisp text summary of position that MUST NOT be more than 100 characters")
    company: str = Field(description="Name of company they worked the position for")

# Extraction schema for one skill listed on a resume.
# `name` is later upper-cased by `load_data` to form the Skill node's entityId.
class Skill(BaseModel):
    id: str = Field(description="Unique skill id")
    name: str = Field(description="The name of the skill")
    level: str = Field(description="Experience level")

# Extraction schema for one education entry on a resume.
# All fields are plain strings, including the graduation date.
class Education(BaseModel):
    id: str = Field(description="Unique education id")
    degree: str = Field(description="Name of educational degree")
    institution: str = Field(description="Name of educational institution")
    location: str = Field(description="Location of educational institution")
    graduationDate: str = Field(description="Date of graduation")

# Top-level extraction schema: one resume maps to one Person carrying nested
# lists of positions, skills and education entries. This is the model passed
# to `llm.with_structured_output`.
class Person(BaseModel):
    id: str = Field(description="Unique person id")
    role: str = Field(description="The job/employment role")
    description: str = Field(description="A crisp text summary and MUST NOT be more than 250 characters")
    # Nested records; these three lists are required (no default is given).
    positions: List[Position]
    skills: List[Skill]
    education: List[Education]


In [5]:
# Extraction pipeline: the prompt is piped into the LLM configured to return
# its answer as a `Person` instance.
chain = prompt | llm.with_structured_output(Person)

  warn_beta(


## Neo4j Setup

In [6]:
from langchain_community.graphs.neo4j_graph import Neo4jGraph

# Connect to the Neo4j instance configured through the environment.
graph = Neo4jGraph(NEO4J_URI, NEO4J_USERNAME, NEO4J_PASSWORD)

# One uniqueness constraint on `entityId` per node label used by the loader.
# Every statement carries IF NOT EXISTS, so the cell can be run repeatedly.
constraint_statements = (
    'CREATE CONSTRAINT person_entityId IF NOT EXISTS FOR (p:Person) REQUIRE (p.entityId) IS UNIQUE;',
    'CREATE CONSTRAINT position_entityId IF NOT EXISTS FOR (p:Position) REQUIRE (p.entityId) IS UNIQUE;',
    'CREATE CONSTRAINT company_entityId IF NOT EXISTS FOR (p:Company) REQUIRE (p.entityId) IS UNIQUE;',
    'CREATE CONSTRAINT job_title_entityId IF NOT EXISTS FOR (p:JobTitle) REQUIRE (p.entityId) IS UNIQUE;',
    'CREATE CONSTRAINT skill_entityId IF NOT EXISTS FOR (p:Skill) REQUIRE (p.entityId) IS UNIQUE;',
    'CREATE CONSTRAINT education_entityId IF NOT EXISTS FOR (p:Education) REQUIRE (p.entityId) IS UNIQUE;',
)
for statement in constraint_statements:
    graph.query(statement)

[]

## Extraction & Loading

In [12]:
# Upper bound on how many of the globbed resume files are processed in a run.
TOTAL_DOCUMENTS = 4
# Number of files extracted per segment before that segment is loaded into Neo4j.
LOAD_CHUNK_SIZE = 2

In [13]:
import glob
import re

def clean_text(t):
    """Return ``t`` with every run of non-ASCII characters collapsed to one space."""
    non_ascii_run = re.compile(r'[^\x00-\x7F]+')
    return non_ascii_run.sub(' ', t)

def chunks(xs, n: int = 1_000):
    """Split ``xs`` into consecutive slices of at most ``n`` items.

    A size below 1 is treated as 1. The final slice may be shorter than the
    rest, and an empty ``xs`` yields an empty list.
    """
    size = max(1, n)
    batches = []
    for start in range(0, len(xs), size):
        batches.append(xs[start:start + size])
    return batches


In [14]:
def format_list_dict(objs: "List[BaseModel]", source_id):
    """Convert extracted models to plain dicts tagged with their source.

    Args:
        objs: Iterable of model instances exposing ``.dict()`` (the nested
            positions / skills / education lists of a ``Person``).
        source_id: Value stored under the ``'sourceId'`` key of every record;
            callers pass the path of the file the models were extracted from.

    Returns:
        A new list with one dict per model, in input order. Each dict is the
        model's ``.dict()`` output with ``'sourceId'`` added (or overwritten).
    """
    return [{**obj.dict(), 'sourceId': source_id} for obj in objs]



def extract_data(txt_files):
    """Run the extraction chain over resume text files and flatten the results.

    Each file is read as UTF-8 (undecodable bytes ignored), stripped of
    trailing whitespace, passed through ``clean_text`` and sent to ``chain``.
    The returned ``Person`` is flattened into dict records tagged with the
    file path under ``'sourceId'``.

    Args:
        txt_files: Sequence of paths to resume text files.

    Returns:
        A tuple ``(people, positions, skills, educations, failed_files)``.
        The first four are lists of dict records; ``failed_files`` lists the
        paths whose extraction or flattening raised. A failed file contributes
        no records to any of the other lists.
    """
    people = []
    positions = []
    skills = []
    educations = []
    failed_files = []

    total = len(txt_files)
    for i, path in enumerate(txt_files):
        # Read and close the file before the slow LLM call so the handle is
        # not held open for the duration of the request.
        with open(path, 'r', encoding='utf-8', errors='ignore') as file:
            text = clean_text(file.read().rstrip())

        try:
            person = chain.invoke(text)
            # Build every record for this file before touching the shared
            # accumulators, so a failure part-way through cannot leave a
            # person or positions recorded for a file reported as failed.
            person_rec = {
                'id': person.id,
                'role': person.role,
                'description': person.description,
                'sourceId': path
            }
            position_recs = format_list_dict(person.positions, path)
            skill_recs = format_list_dict(person.skills, path)
            education_recs = format_list_dict(person.education, path)
        except Exception as e:
            # Deliberately best-effort: one bad resume must not abort the batch.
            print(f"{path}: Processing Failed with exception {e}")
            failed_files.append(path)
            continue

        people.append(person_rec)
        positions.extend(position_recs)
        skills.extend(skill_recs)
        educations.extend(education_recs)
        print(f"Successfully processed {i+1} of {total}")
    return people, positions, skills, educations, failed_files

def load_data(people, positions, skills, educations):
    """Write extracted records to Neo4j in batches.

    People are merged first; positions, skills and education entries are then
    attached to the Person whose ``entityId`` equals the record's ``sourceId``.
    Each record list is split with ``chunks`` (default batch size) and every
    batch is sent as the ``$recs`` parameter of one Cypher statement.

    Args:
        people: Person dict records (``id``, ``role``, ``description``, ``sourceId``).
        positions: Position dict records carrying ``sourceId``.
        skills: Skill dict records carrying ``sourceId``.
        educations: Education dict records carrying ``sourceId``.
    """
    # Person nodes are keyed by the source file the resume came from.
    person_cypher = '''
        UNWIND $recs AS rec
        MERGE(n:Person {entityId: rec.sourceId})
        SET n += rec
        RETURN count(n)
        '''

    # Position nodes plus their JobTitle link; the Company link is only made
    # for rows whose company is not the empty string.
    position_cypher = '''
        UNWIND $recs AS rec
        MATCH(p:Person {entityId: rec.sourceId})
        MERGE(n:Position {entityId: rec.sourceId + ' - ' + rec.id})
        SET n += rec
        MERGE(p)-[:HAS_POSITION]->(n)
        WITH n
        MERGE(j:JobTitle {entityId: toUpper(n.title)})
        MERGE(n)-[r:WITH_TITLE]->(j)
        WITH n
        WHERE n.company <> ""
        MERGE(c:Company {entityId: toUpper(n.company)})
        MERGE(n)-[r:AT_COMPANY]->(c)
        RETURN count(n)
        '''

    # Skill nodes are keyed by upper-cased name; the record's own fields are
    # stored on the HAS_SKILL relationship.
    skill_cypher = '''
        UNWIND $recs AS rec
        MATCH(p:Person {entityId: rec.sourceId})
        MERGE(n:Skill {entityId: toUpper(rec.name)})
        MERGE(p)-[r:HAS_SKILL]->(n)
        SET r += rec
        RETURN count(r)
        '''

    # Education nodes are keyed per source file, like positions.
    education_cypher = '''
        UNWIND $recs AS rec
        MATCH(p:Person {entityId: rec.sourceId})
        MERGE(n:Education {entityId: rec.sourceId + ' - ' + rec.id})
        MERGE(p)-[:HAS_EDUCATION]->(n)
        SET n += rec
        RETURN count(n)
        '''

    # Order matters: the later statements MATCH Person nodes created by the first.
    load_plan = (
        (people, person_cypher),
        (positions, position_cypher),
        (skills, skill_cypher),
        (educations, education_cypher),
    )
    for records, cypher in load_plan:
        for batch in chunks(records):
            graph.query(cypher, params={'recs': batch})

In [15]:
%%time

from tqdm import tqdm

# glob returns matches in arbitrary order, so sort before slicing to make the
# selected subset of resumes the same on every run.
text_files = sorted(glob.glob("data/*.txt"))[:TOTAL_DOCUMENTS]
# Paths whose extraction failed, accumulated across all segments.
failed_files_list = []

# Process the files in segments: extract one segment with the LLM, then load
# it into Neo4j before moving on, so finished segments are persisted even if
# a later one fails.
for txt_file_seg in tqdm(chunks(text_files, LOAD_CHUNK_SIZE)):
    print('======= Extracting Data From Files Segment ========')
    people, positions, skills, educations, failed_files = extract_data(txt_file_seg)
    print('Completed Extraction From Files Segment')
    failed_files_list.extend(failed_files)
    print('======= Loading Extracted Data ========')
    load_data(people, positions, skills, educations)


  0%|          | 0/2 [00:00<?, ?it/s]

Successfully processed 0 of 2
Successfully processed 1 of 2
Completed Extraction From Files Segment


 50%|█████     | 1/2 [00:22<00:22, 22.83s/it]

Successfully processed 0 of 2
Successfully processed 1 of 2
Completed Extraction From Files Segment


100%|██████████| 2/2 [01:07<00:00, 33.79s/it]

CPU times: user 98.5 ms, sys: 15.5 ms, total: 114 ms
Wall time: 1min 7s



