In [None]:
# Resume PDF shared by every extraction experiment in this notebook
# (pymupdf4llm, doctr OCR and pypdf all read this path). The path is relative,
# so it resolves against the kernel's working directory.
pdf_path = '../resumes/Parth_SD_Resume.pdf'

In [None]:
import pymupdf4llm
import pymupdf
from IPython.display import Markdown, display

# Convert the resume PDF to Markdown with pymupdf4llm.
# The document is opened in a `with` block so its file handle is released as
# soon as the conversion finishes, instead of staying open for the lifetime of
# the kernel. `file` therefore refers to a closed document after this cell.
with pymupdf.open(pdf_path) as file:
    md_text = pymupdf4llm.to_markdown(file)

# Bare last expression: the cell displays the Markdown source string.
md_text

In [None]:
from doctr.models import ocr_predictor, from_hub
from doctr.io import DocumentFile

# Load the resume PDF into doctr's document representation.
docs = DocumentFile.from_pdf(pdf_path)
# Recognition model fetched by identifier through from_hub
# (presumably a Hugging Face Hub checkpoint; confirm the source before relying on it).
model = from_hub('Felix92/doctr-torch-parseq-multilingual-v1')
# OCR pipeline: 'fast_base' is the detection architecture, the hub model above
# does the recognition. Pages are assumed straight and orientation detection is
# switched off via the two flags below.
predictor =  ocr_predictor(
    det_arch='fast_base',
    reco_arch=model,
    pretrained=True,
    assume_straight_pages=True,
    detect_orientation=False,
    
)

# Run the pipeline over the loaded pages; `result` is reused by the next cell.
result = predictor(docs)
# Display doctr's own view of the OCR result for a quick visual check.
result.show()

In [None]:
# Turn the OCR result from the previous cell into a text rendering
# (presumably one plain string; confirm against doctr's `render` docs).
text = result.render()

# Print it so the rendering can be read and compared with the other extractors.
print(text)

In [None]:
from pypdf import PdfReader

# Extract text with pypdf: one extract_text() call per page, with the page
# texts joined by a newline into a single string.
with open(pdf_path, 'rb') as f:
    reader = PdfReader(f)
    content = "\n".join(page.extract_text() for page in reader.pages)

# Bare last expression: the cell displays the extracted text.
content

In [18]:
from typing import TypedDict
from gliner import GLiNER
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
import re

class Candidate(TypedDict):
    """Typed shape for the contact and education fields of one candidate.

    NOTE(review): extract_person_metadata is annotated as returning this type,
    but the dict it builds is keyed by "email" plus the GLiNER label strings
    ("name", "phone number", "university/college", "college_degree"), so only
    "name" and "email" line up with the fields declared here. Confirm which
    key set is the intended one.
    """
    # Candidate's name.
    name: str
    # Phone number, kept as text.
    phone: str
    # Email address.
    email: str
    # College degree.
    degree: str

# Module-level resources read by extract_person_metadata.

# Entity labels handed to GLiNER for every proposition.
labels = ["name", "phone number", "university/college", "college_degree"]
ner = GLiNER.from_pretrained("urchade/gliner_large-v2.1")

# Chat model plus a one-question prompt: given a proposition, answer with the
# degree name only, or "None" when the sentence does not contain one.
llm = ChatOpenAI(temperature=0.5)
degree_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "Given a sentence, answer the question in the most straightforward manner. "
            "Do not add additional information, just only say what the user has asked. "
            "If there is no answer, then only say \"None\"",
        ),
        ("human", "{proposition}\n\nWhat is the name of the college degree only?"),
    ]
)
chain = degree_prompt | llm

def extract_person_metadata(propositions: list[str]) -> Candidate:
    data: dict[str, list[set[str]]] = {
        "email": [set(), set()],
    }
    for proposition in propositions:
        found_email = re.search("(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|\"(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21\\x23-\\x5b\\x5d-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])*\")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21-\\x5a\\x53-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])+)\\])", proposition.lower())
        if found_email is not None:
            data["email"][1].add(found_email.group(0))
        entities = ner.predict_entities(proposition, labels, threshold=0.5)
        for entity in entities:
            key, value = entity["label"], entity["text"]
            if key not in data:
                data[key] = [set(), set()]

            match key:
                case "university/college":
                    value = value.lower()
                    if "college" in value or "university" in value:
                        data[key][1].add(value.replace("the", "").strip().title())
                case "college_degree":
                    value = value.lower()
                    if value not in data[key][1]:
                        data[key][0].add(proposition)
                        answer = chain.invoke({ "proposition": proposition }).content
                        if answer != "None":
                            data[key][1].add(answer)
                case _:
                    data[key][1].add(value)
    for key in data:
        data[key] = ", ".join([x.rstrip('.') for x in data[key][1]])
    return data

# Smoke test for extract_person_metadata on one candidate's propositions.
# One sentence per line: as a single long line, two of the string literals had
# been broken across physical lines, which is an unterminated-literal syntax
# error.
# NOTE(review): the first two sentences look like real contact details;
# confirm they are safe to keep in a committed notebook.
sample_propositions = [
    "Chris Carr's phone number is (720)-326-8866.",
    "Chris Carr's email is Chriscoo2005@gmail.com.",
    "Chris Carr is an avid learner and tech enthusiast.",
    "Chris Carr has a strong foundation in game design, programming, and XR technologies.",
    "Chris Carr is eager to contribute to the University of Arizona's AI Core as an XR Dev.",
    "Chris Carr's familiarity in various tech disciplines combined with hands-on experience in immersive technologies aligns with the innovative objectives at the AI Core program.",
    "Chris Carr is committed to further developing skills in AI.",
    "Chris Carr is committed to sharing knowledge with peers to drive the Arizona economy into the AI era.",
    "Chris Carr is a freshman at the University of Arizona.",
    "Chris Carr is majoring in Game Design and Development.",
    "Chris Carr has global travel experience.",
    "Chris Carr has basic Japanese skills.",
    "Chris Carr graduated from Canyon del Oro High School in the Class of 2023.",
    "Chris Carr was an International Baccalaureate certificate student.",
    "Chris Carr had a weighted G.P.A. of 4.18.",
    "Chris Carr had an unweighted G.P.A. of 3.75.",
    "Chris Carr scored a 3 on the AP Computer Science Principles Test.",
    "Chris Carr scored a 3 on the AP Literature and Composition Test.",
    "Chris Carr was on the High Honor Roll for several years.",
    "Chris Carr received a student coding award.",
    "Chris Carr was a recipient of the University of Arizona Wildcat Excellence Scholarship.",
    "Chris Carr was a Canyon Del Oro High School Scholar.",
    "Chris Carr received the Canyon Del Oro Academic Letter.",
    "Chris Carr has solid experience in game design and development.",
    "Chris Carr is proficient in Unity.",
    "Chris Carr is eager to learn Unreal Engine 5.",
    "Chris Carr has knowledge of VR experiences.",
    "Chris Carr installed a home cable management system with pulleys for VR use.",
    "Chris Carr 3D printed components for self-created board games.",
    "Chris Carr has proficient programming skills including HTML, Python, JavaScript, and CSS.",
    "Chris Carr engaged in computer modification projects.",
    "Chris Carr has a deep understanding of computer hardware beneficial for AR/VR hardware.",
    "Chris Carr demonstrated ability in technology repair and IT support.",
    "Chris Carr has introductory experience with Generative AI tools.",
    "Chris Carr is ready for further development and use of Generative AI tools in innovative college programming.",
    "Chris Carr has creative competencies in graphic design, photography, and 3D modeling.",
    "Chris Carr's creative competencies are helpful skills for XR development.",
    "Chris Carr has game design skills specifically in Unity and Blender.",
    "Chris Carr is an avid gamer, both digital and tabletop, specifically Dungeons and Dragons.",
    "Chris Carr's gaming experience demonstrates an understanding of XR content and the ability to design.",
    "Chris Carr worked at Oro Valley Home Depot focusing on customer service.",
    "Chris Carr honed skills vital for teamwork and collaboration at Oro Valley Home Depot.",
    "Chris Carr worked as a Law Office Assistant.",
    "Chris Carr developed organizational skills and attention to detail as a Law Office Assistant.",
    "Chris Carr is a gamer and game designer.",
    "Chris Carr engages in home and car repair.",
    "Chris Carr is an electronics hobbyist.",
    "Chris Carr was a previous member of the German Club.",
    "Chris Carr was a previous member of the Technology Student Association.",
    "Chris Carr is a member of the University of Arizona Video Game Club.",
    "Chris Carr is a Red Cross Ambassador.",
    "Chris Carr participated in a mission trip in L.A. and Skid Row.",
    "Chris Carr is involved in drone flight and R.C. hobbyism.",
    "Chris Carr plays guitar.",
    "Chris Carr is involved in music and video production.",
]

# Bare last expression: the cell displays the extracted metadata dict.
extract_person_metadata(sample_propositions)

Fetching 4 files: 100%|██████████| 4/4 [00:00<00:00, 94254.02it/s]
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'email': 'chriscoo2005@gmail.com',
 'name': 'Chris Carr',
 'phone number': '(720)-326-8866',
 'university/college': 'University Of Arizona',
 'college_degree': 'Game Design and Development'}

In [None]:
# from gliner import GLiNER

# ner = GLiNER.from_pretrained("urchade/gliner_large-v2.1")

# labels = ["name"]

# entities = ner.predict_entities(, labels, threshold=0.1)

# for entity in entities:
#     print(entity["label"], "=>", entity["text"])
    