In [15]:
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_ollama import OllamaEmbeddings, ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers.multi_query import MultiQueryRetriever
from tqdm.notebook import tqdm
import time
import itertools

In [16]:
# Path to the resume PDF, relative to the notebook's working directory.
local_path = "../pdf/Mateo Wheeler Resume v2.pdf"

# Fail fast when no path is configured: every later cell reads `data`, so merely
# printing a hint here would surface later as a NameError (or, on a reused kernel,
# silently continue with a stale `data` from a previous run).
if not local_path:
    raise ValueError("Upload a PDF file for processing.")

# Parse the PDF; `data` is the list of documents returned by the loader.
loader = UnstructuredPDFLoader(file_path=local_path)
data = loader.load()

In [17]:
# Sanity check: length of the text extracted into the first loaded document.
len(data[0].page_content)

6828

In [18]:
# Chunking parameters: window length and the overlap shared by neighbouring windows.
chunk_size, chunk_overlap = 150, 75

# Cut the loaded document(s) into overlapping pieces.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)
chunks = text_splitter.split_documents(data)

# Embed every piece with the nomic-embed-text model and index the result
# in the "local-rag" Chroma collection.
vector_db = Chroma.from_documents(
    chunks,
    embedding=OllamaEmbeddings(model="nomic-embed-text"),
    collection_name="local-rag",
)

Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


In [19]:
# Local chat model served by Ollama.
local_llm = "llama3.2"
llm = ChatOllama(model=local_llm)

# Prompt used by MultiQueryRetriever to expand one question into several search queries.
# The retriever splits the model's reply on newlines and runs each line as its own
# vector-store query, so this prompt must ask for rephrasings (one per line) rather
# than for an answer to the question.
QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""You are helping a recruiter evaluate if a candidate is a good fit for a role.
Generate five different versions of the question below so that relevant passages of the
candidate's resume can be retrieved from a vector database.
Provide only these alternative questions, one per line, without numbering or commentary.
Original question: {question}""",
)

retriever = MultiQueryRetriever.from_llm(vector_db.as_retriever(), llm, prompt=QUERY_PROMPT)

# Answer-generation prompt: {context} is filled with whatever the retriever returns,
# {question} with the chain input forwarded unchanged.
chat_template = """Answer the question based only on the following context: 
{context}
Question: {question}
"""

prompt = ChatPromptTemplate.from_template(chat_template)

# The chain input is handed to both entries of the dict below: the retriever uses it as
# the search query, and RunnablePassthrough forwards it as-is into {question}.
# Invoke the chain with a single string.
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [20]:
# Free-text inputs describing the role, pasted from a LinkedIn job post.
# company_description: who is hiring; job_description: the position's requirements.
# Both are interpolated into the recruiter prompt built in a later cell.
company_description = """Sift equips healthcare providers and revenue cycle managers with a complete payments analytics platform making it easy to visualize and understand payment trends, prioritize RCM workflows and accelerate cash flow.
Sift improves data clarity and optimizes the financial performance of the entire revenue cycle continuum. Meaningful insights help reduce denials, increase patient payments, maximize reimbursements and reduce time and cost to collect.
"""

job_description = """Five (5) plus years of experience working with large disparate data sets, using machine learning to derive healthcare insights
Experience with deep learning / GenAI
NLP, LLMs, Transformers, LSTM, GNNs, etc.
Experience with productionization is a plus.
Experience with the following concepts: tree-based models (Random Forest, GBM, XGBoost, etc.), clustering models (K-means, KNN, KAMILA, DBSCAN, etc.), forecasting (ARIMA, Prophet, etc.), regression, anomaly detection, etc.
Experience working with temporal, cross-sectional, and unstructured data
Experience with git, including conducting & participating in code reviews
Experience with Python and SQL
Experience working with healthcare data
EMR, Claims, Midcycle Revenue, Clinical or Coding data experience a plus
Strong written and oral communication skills
Advanced degree (MS or higher) in applied data science, statistics, economics, computer science, computational natural sciences, or a related field
                """

In [21]:
# Role framing for the model: who is hiring and what the position requires.
# NOTE: despite its name, this is not the retrieved resume text that fills the
# answer prompt's {context} slot; that slot is populated by the retriever.
context = f"Role: you are a recruiter for {company_description}. The job position requirements are the following:\
            {job_description}\
        "

q = "Can you evaluate the candidate in this resume document for the role provided?"

# The chain expects ONE string: it serves as the retriever's search query and is also
# forwarded into the prompt's {question} slot. Passing a dict instead gets stringified,
# so the Python dict repr ends up in both prompts and its 'context' key is never routed
# anywhere. Send the role framing and the question together as plain text.
response = chain.invoke(f"{context}\n{q}")
print(response)

Based on the context provided, it appears that the candidate has relevant experience in machine learning, deep learning, and natural language processing (NLP), as well as experience with data visualization, productionization, and healthcare data. The job requirements mention a strong background in business analytics, data science, and statistics.

The candidate's resume documents skills such as:

* Experience with large disparate data sets
* Deep learning/GenAI
* NLP, LLMs, Transformers, LSTM, GNNs, etc.
* Productionization experience
* Knowledge of tree-based models (Random Forest, GBM, XGBoost), clustering models (K-means, KNN, DBSCAN), forecasting (ARIMA, Prophet), regression, and anomaly detection

The candidate also has strong written and oral communication skills, an advanced degree in a related field, and experience working with healthcare data (EMR, Claims, Midcycle Revenue, Clinical or Coding data).

Considering the role of Sift equips healthcare providers and revenue cycle ma