In [1]:
import os
import openai
import sys

In [2]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.schema import Document

from langchain.chains import RetrievalQA

from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from openai import OpenAI

from langchain_community.chat_models import ChatOllama
from langchain_community.embeddings import OllamaEmbeddings

from langchain.chains.summarize import load_summarize_chain
from langchain.prompts import ChatPromptTemplate, PromptTemplate


from langchain_upstage import ChatUpstage
from langchain_upstage import UpstageEmbeddings # Embeddings
from langchain_core.messages import HumanMessage, SystemMessage

from langchain import PromptTemplate
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory

from tqdm import tqdm

In [3]:
!rm -rf ./docs/chroma # remove old database files, if any, so each run starts from an empty store

In [4]:
# Directory in which Chroma persists the vector store.
persist_directory = "docs/chroma/"

# End-to-End Function

In [7]:
# Directory in which Chroma persists the vector store (read by update_vector_db).
persist_directory = 'docs/chroma/'

# Ollama server and model used for the PII-filtering step.
# The URL can be overridden through the OLLAMA_BASE_URL environment variable.
OLLAMA_BASE_URL = os.environ.get("OLLAMA_BASE_URL", "http://172.17.0.8:11434")
FILTER_MODEL = "EEVE-Korean-10.8B-FOR-FILTER:latest"

# SECURITY: the Upstage API key must never be committed with the notebook.
# It is read from the environment instead; the key that was previously
# hardcoded in this cell should be treated as leaked and rotated.
UPSTAGE_API_KEY = os.environ.get("UPSTAGE_API_KEY")
if not UPSTAGE_API_KEY:
    raise RuntimeError(
        "Set the UPSTAGE_API_KEY environment variable before running this notebook."
    )

filter_embedding = OllamaEmbeddings(model=FILTER_MODEL, base_url=OLLAMA_BASE_URL)
filter_llm = ChatOllama(model=FILTER_MODEL, base_url=OLLAMA_BASE_URL, temperature=0)

solar_embedding = UpstageEmbeddings(
    api_key=UPSTAGE_API_KEY,
    model="solar-embedding-1-large"
)

# LLM used to write the per-candidate summary inside update_vector_db.
solar_llm = ChatUpstage(api_key=UPSTAGE_API_KEY, temperature=0)

# LLM used by the conversational retrieval chains further down.
chat_llm = ChatUpstage(api_key=UPSTAGE_API_KEY, temperature=0)

In [8]:
# Resume PDFs to ingest. Order matters: the ingestion loop labels the file at
# index i as "Candidate No.{i+1}".
file_list = [
    "contents/cv_mingyu.pdf", "contents/cv_hyunji.pdf",
    "contents/cv_agung.pdf",  # example resume
    "contents/tmpy6pj9ef9.pdf",
]

In [9]:
def update_vector_db(candidate_num, file_name, filter_embedding=filter_embedding, filter_llm=filter_llm,
                     solar_embedding=solar_embedding, solar_llm=solar_llm):
    """Anonymize one resume PDF, summarize it, and store the summary in Chroma.

    Steps:
      1. Load the PDF and split it into chunks (chunk_size=1500, overlap=150).
      2. Send each chunk to ``filter_llm`` with a prompt that asks it to remove
         PII and replace the candidate's name with ``candidate_num``.
      3. Join the filtered chunks and ask ``solar_llm`` for a single Korean
         summary that begins with ``candidate_num``.
      4. Split that summary (chunk_size=4000, overlap=100) and add it to the
         Chroma store at the module-level ``persist_directory``, embedded with
         ``solar_embedding``.

    Progress previews, the final summary and the evaluation banner are printed.

    Args:
        candidate_num: Full anonymized label for the candidate, e.g.
            "Candidate No.1". It is inserted into both prompts verbatim.
        file_name: Path of the resume PDF to load.
        filter_embedding: Not used by this function; kept so existing callers
            that pass it keep working.
        filter_llm: Chat model used for the per-chunk PII filtering step.
        solar_embedding: Embedding model used when writing to Chroma.
        solar_llm: Chat model used for the final summary.

    Returns:
        The vector store object returned by ``Chroma.from_texts``.

    Note:
        The default model arguments are bound when this ``def`` cell runs, so
        re-run this cell after changing the model configuration cell.
    """
    # Load data
    pages = PyPDFLoader(file_name).load()

    # Raw string: "\." is not a valid escape in a normal string literal.
    # NOTE(review): without is_separator_regex=True the splitter appears to
    # treat this look-behind pattern as a literal string, so it may never
    # match - confirm against the installed langchain version.
    separators = ["\n\n", "\n", r"(?<=\. )", " ", ""]

    chunk_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1500,
        chunk_overlap=150,
        separators=list(separators),
    )
    splits = chunk_splitter.split_documents(pages)

    # ---------------------------------------------------------
    # First Chain(FILTERING)
    # ---------------------------------------------------------

    # The caller already passes the full label ("Candidate No.1"), so the
    # prompt uses {candidate_num} as-is; prefixing it again produced
    # "Candidate No.Candidate No.1".
    map_prompt = """
    You are a helpful AI Filtering engine.
    You will be given a single passage of a document resume or curriculum vitae in Korean or English. 
    This section will be enclosed in triple backticks (```)
    AI Filtering engine task is to identify and remove any personal identifiable information (PII) such as phone numbers, emails, web addresses, and physical addresses.
    Specifically, if you find a name, replace it with "{candidate_num}". 
    While removing PII, ensure that all job-related information such as education, work experience, skills, and awards are accurately preserved and summarized. 
    Do not omit any important details related to the candidate's qualifications and experience but you should remove the PII except replaced name.
    Do not contain any other information that I didn't ask.

    ```{text}```
    FULL SUMMARY:
    """
    map_prompt_template = PromptTemplate(template=map_prompt, input_variables=["text", "candidate_num"])

    map_chain = load_summarize_chain(llm=filter_llm,
                                     chain_type="stuff",
                                     prompt=map_prompt_template)

    # One filtered summary per chunk, in document order.
    summary_list = []
    for doc in splits:
        # {text} is filled by the stuff chain from input_documents, so it is
        # not passed separately.
        chunk_summary = map_chain.invoke({"input_documents": [doc], "candidate_num": candidate_num})

        summary_text = chunk_summary['output_text']
        summary_list.append(summary_text)

        print(f"Preview: {summary_text}...", "\n")
        print("==================================================================")

    # ---------------------------------------------------------
    # Second Chain(SUMMARY)
    # ---------------------------------------------------------

    merged_summary_text = "\n".join(summary_list)

    # Wrap the merged text in a Document so the stuff chain can consume it.
    summaries_doc = Document(page_content=merged_summary_text)

    combine_prompt = """
    You are a helpful AI SCOUT BOT! Your name is scouty.
    You will be given a series of summaries from a resume in Korean or English. 
    The summaries will be enclosed in triple backticks (```)
    AI SCOUT BOT goal is to give a verbose summary of less than 4000 characters in Korean only.
    Ensure that all important job-related information such as education, work experience, skills, and awards are included.
    Only answer it based on a given information.
    The reader should be able to grasp what happened in the document for hiring the candidate.
    Please begin the summary with the {candidate_num}!

    ```{text}```
    VERBOSE SUMMARY:
    """
    combine_prompt_template = PromptTemplate(template=combine_prompt, input_variables=["candidate_num", "text"])

    resume_chain = load_summarize_chain(llm=solar_llm,
                                        chain_type="stuff",
                                        prompt=combine_prompt_template,
                                        verbose=False  # Set this to true if you want to see the inner workings
                                        )

    output = resume_chain.invoke({"input_documents": [summaries_doc], "candidate_num": candidate_num})
    final_output = output['output_text']
    print(final_output)
    print("----------------------------------------------------------------")
    print(f"SUMMARY EVALUATION of {candidate_num} :")
    # NOTE(review): evaluate_summary is not defined in the visible notebook
    # cells; it must exist in the kernel before this function is called.
    # Its return value was never used, so it is no longer stored.
    evaluate_summary(final_output, merged_summary_text)
    print("----------------------------------------------------------------")

    # ---------------------------------------------------------
    # Third Chain (final chain to vector DB)
    # ---------------------------------------------------------

    summary_splitter = RecursiveCharacterTextSplitter(
        chunk_size=4000,
        chunk_overlap=100,
        separators=list(separators),
    )
    splits_output = summary_splitter.split_text(final_output)

    vectordb = Chroma.from_texts(
        texts=splits_output,
        embedding=solar_embedding,
        persist_directory=persist_directory
    )

    return vectordb

In [10]:
# Ingest every resume in file_list. tqdm wraps the list itself (not the
# enumerate object) so the bar knows the total and can show progress and ETA.
# vectordb is reassigned on each pass; the handle from the last call is the
# one the later cells use.
for i, filename in enumerate(tqdm(file_list, desc="Ingesting resumes")):
    candidate_num = f"Candidate No.{i+1}"
    vectordb = update_vector_db(candidate_num=candidate_num, file_name=filename,
                                filter_embedding=filter_embedding, filter_llm=filter_llm,
                                solar_embedding=solar_embedding, solar_llm=solar_llm)

0it [00:00, ?it/s]

Preview:  AI Filtering Engine Summary:

Candidate No.1 is an aspiring AI engineer with experience in various domains and strong communication and collaboration skills. They have consistently documented their learning journey on Notion and are passionate about acquiring new knowledge and skills through self-improvement and research. Recently, they have been interested in LLM, LoRA, RA G methodologies and have been actively participating in related studies.

Education: [Educational background information preserved]... 

Preview:  Candidate No.1 has a Master's degree in Industrial and Data Engineering from Pukyong & Pusan National Universities (2023.03). They also have a Bachelor's degree in System Management Engineering with a major in Technical Data Engineering from Pukyong National University (2017-2023.02).

Skills and Certifications:
- Data visualization, EDA, statistical analysis, modeling, result analysis, ML pipeline construction, reporting, communication, storytelling skills
- Pr

1it [00:43, 43.24s/it]

Preview:  Candidate No.2 is a machine learning enthusiast with an open mindset and adaptability, who has consistently documented their growth on Notion and blogs. They enjoy collaborating with diverse team members and have participated in various project activities. With a strong passion for continuous self-improvement and a spirit of challenge, they are committed to personal development.

Work Experience:
1. Dataedu (2020.12 - 2021.02): Worked at a data-driven technology company specializing in consulting and education services related to big data and AI. Contributed to the creation of an effective and accessible learning experience by collecting, processing, and organizing high-quality YouTube content for a new website.
2. Alchera (2023.03 - 2023.08): Gained hands-on experience in developing a fire prevention system model from data collection to model development using CVAT for labeling 연기 data, detailed labeling guidelines, and communication with labelers. Developed an OCR model usi

2it [01:27, 43.55s/it]

Preview:  Candidate No.3 is a highly motivated and curious data scientist with over four years of professional experience in transforming business goals into data-driven results through explanatory, diagnostic, predictive, and prescriptive analysis. They have expertise in various big data analytics tools and platforms such as Python, SQL, PySpark, TensorFlow, PyTorch, and Zeppelin. Additionally, they possess skills in deploying machine learning models to analytical dashboards or APIs and knowledge of microservices.

Education:
- Bachelor's degree in Engineering from the University of Indonesia (UI) with a GPA of 3.21/4.00
- Certificate in Data Science and Machine Learning Engineering from Purwadhika Digital Technology School with a score of 93.5/100
- Master's degree in Industrial and Data Science Engineering from Pukyoeng National University (Busan University of Technology) and PuSan National University, joint program with a GPA of 4.33/4.50
- Doctoral candidate in Industrial and Data

3it [02:02, 39.96s/it]

Preview:  Candidate No.4 is a SAP HCM consultant with experience in full cycle implementation projects at PT. Pelabuhan Indonesia Raya (PELINDO). They have expertise in various fields such as SAP Time Management, SAP Payroll, SAP Travel Management, SAP Organization Management, SAP Personnel Administration, SAP FIORI (ES/MS), and SAP Personnel Cost Planning. The candidate successfully completed a challenging project involving data and system integration between four previously separate companies, demonstrating strong strategic planning, public speaking, negotiation, teamwork, problem-solving, critical thinking, and time management skills. Additionally, they are proficient in four languages, which enhances their consultant capabilities and ability to work with diverse clients.

Work Experience:
Company: NTT DATA BUSINESS SOLUTIONS (Indonesia)
Duration: May 2022 - June 2023
Role: Junior SAP HCM Consultant
Project: PT. Pelabuhan Indonesia Raya
Phase 1 (Implementation of SAP HCM): June 2022

4it [02:30, 37.69s/it]


In [13]:
# Sanity check: number of entries currently held by the Chroma collection.
stored_entry_count = vectordb._collection.count()
print(stored_entry_count)

4


In [21]:
# Prompt for the scoring pass: asks the model to rate every candidate from 1 to 5
# on experience, relevance, education and skills, and to answer as JSON.
metrics_prompt_template = """
        You are a helpful AI SCOUT BOT! Your name is scouty.
        If the context is not relevant or is difficult to provide a specific definition or detailed understanding, 
        please answer the question by using your own knowledge about the topic
        
        If question asking related with calculate or evaluate candidate resumes You should following below instructions.
        
        Here are the metrics to evaluate candidate resumes. 
        For all candidates, please calculate the following metrics, and all metric scores should be in the range of 1(bad) to 5(good). 
        Please provide the results in a JSON format, where the keys are the candidate names, and the values are another JSON object containing the scores for each metric.

        Metrics to evaluate:
        (1). experience: Measure the total years of relevant work experience.
        (2). relevance: Assess how closely the applicant's experience aligns with the job domain.
        (3). education: Evaluate the level and relevance of the applicant's education to the job requirement.
        (4). skills: Rate the proficiency in key technical skills required for the job.

        The output should be in the following format:
        {{
            "Candidate Name 1": {{
                "experience": X,
                "relevance": Y,
                "education": Z,
                "skills": W,
            }},
            ...
        }}
        
        {context}
        
        Question: {question}
        """
Metrics_PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=metrics_prompt_template,
)

# Chat-history buffer dedicated to the scoring chain.
metrics_memory = ConversationBufferMemory(return_messages=True, memory_key="chat_history")

# Retrieval chain for scoring. The retriever uses MMR search with k=10 and
# fetch_k=50; when the index holds fewer entries Chroma logs a warning and
# lowers the number of requested results.
metrics_qa = ConversationalRetrievalChain.from_llm(
    llm=chat_llm,
    retriever=vectordb.as_retriever(
        search_type="mmr",
        search_kwargs={"k": 10, "fetch_k": 50},
    ),
    # Other chain types may need a different parameter setup for
    # ConversationalRetrievalChain.
    chain_type="stuff",
    memory=metrics_memory,
    combine_docs_chain_kwargs={"prompt": Metrics_PROMPT},
)

In [22]:
# Job requirements every candidate is scored against.
# ("data scince" was a typo that was being sent to the model verbatim.)
job_domains = ["team leader", "data science", "machine learning"]
skills_list = ["python", "pytorch", "api"]
question = f"""
I should rank all candidates based on their background to be in job domain {job_domains} and have skills {skills_list} with scale in range 1 to 5.
"""
# Keep the raw chain response under its own name so metrics_result is only
# ever the flattened answer string that the next prompt embeds.
metrics_response = metrics_qa.invoke({"question": question})
metrics_result = metrics_response['answer'].replace('\n\n',' ').replace('\n',' ')
print(metrics_result)

Number of requested results 50 is greater than number of elements in index 4, updating n_results = 4


{   "Candidate No.1": {     "experience": 4,     "relevance": 4,     "education": 4,     "skills": 4   },   "Candidate No.2": {     "experience": 3,     "relevance": 3,     "education": 3,     "skills": 3   },   "Candidate No.3": {     "experience": 5,     "relevance": 5,     "education": 5,     "skills": 5   },   "Candidate No.4": {     "experience": 2,     "relevance": 2,     "education": 2,     "skills": 2   } }


In [23]:
# The metrics answer is JSON, and PromptTemplate treats single braces as
# template variables. Doubling the braces makes them render literally, so the
# model sees the intact per-candidate JSON instead of a brace-stripped,
# flattened list of scores.
escaped_metrics = metrics_result.replace("{", "{{").replace("}", "}}")

# Prompt for the interactive chat: answers must rely on the scores above.
prompt_template = f"""
        You are a helpful AI SCOUT BOT! Your name is scouty. 
        please answer the question by using your own knowledge about the topic
        please only refer based on this metrics for ranking information of each cadidates : {escaped_metrics}
        please provide a concise answer in 5 sentences or less.
        At the end of your answer, explicitly state that you need to review it further.
        
        {{context}}
        
        Question: {{question}}
        """
PROMPT = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"]
)

# Chat-history buffer for the interactive chain, so follow-up questions can
# refer back to earlier answers.
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True
)

# Retrieval chain used by the question cells below.
qa = ConversationalRetrievalChain.from_llm(
    chat_llm,
    retriever=vectordb.as_retriever(search_type="mmr",
                                    search_kwargs={'k': 10, 'fetch_k': 50}),
    chain_type="stuff",  # other chain types may need a different parameter setup for ConversationalRetrievalChain
    memory=memory,
    combine_docs_chain_kwargs={"prompt": PROMPT}
)

In [24]:
# Asks (in Korean): considering the job-related scores, which applicant is the
# best fit for a data scientist team leader?
question = """
직무 관련 점수를 고려했을 때, 지원자 중에 데이터 사이언티스트 팀 리더에 가장 적합한 후보자를 알려줘.
"""
result = qa.invoke({"question": question})
# Collapse line breaks so the answer is displayed on a single line.
(
    result["answer"]
    .replace("\n\n", " ")
    .replace("\n", " ")
)

Number of requested results 50 is greater than number of elements in index 4, updating n_results = 4


'제공된 정보를 기반으로, 후보자 3은 데이터 사이언스 분야에서 높은 동기와 호기심을 가진 전문가로서, 다양한 연구 프로젝트, 조직 활동, 그리고 다양한 수상 경력을 가지고 있습니다. 그들은 다양한 빅 데이터 분석 도구와 플랫폼에 대한 전문 지식을 가지고 있으며, 다양한 기술 스킬을 보유하고 있습니다. 따라서, 후보자 3은 데이터 사이언티스트 팀 리더에 가장 적합한 후보자로 보입니다. 그러나, 더 자세한 검토를 통해 최종 결정을 내리는 것이 좋습니다.'

In [25]:
# Asks (in Korean): compared with the other candidates, what are this
# candidate's strengths? Relies on the chain's chat history for "this candidate".
question = """
다른 후보자와 비교했을 때, 해당 후보자의 강점이 뭐야?
"""
result = qa.invoke({"question": question})
# Collapse line breaks so the answer is displayed on a single line.
(
    result["answer"]
    .replace("\n\n", " ")
    .replace("\n", " ")
)

Number of requested results 50 is greater than number of elements in index 4, updating n_results = 4


'후보자 3은 다양한 연구 프로젝트, 조직 활동, 그리고 다양한 수상 경력을 가진 숙련된 데이터 과학자입니다. 그들은 광범위한 기술 스킬과 다양한 도구 및 프레임워크에 대한 숙련도를 가지고 있어 데이터 과학 관련 직무에 탁월한 후보자입니다.'

In [26]:
# Asks (in Korean): list three questions worth asking this candidate in an
# interview.
question = """
면접을 진행할 때에, 해당 후보자에게 면접에서 할만한 질문 3가지를 리스트업 해줘.
"""
result = qa.invoke({"question": question})
# Collapse line breaks so the answer is displayed on a single line.
(
    result["answer"]
    .replace("\n\n", " ")
    .replace("\n", " ")
)

Number of requested results 50 is greater than number of elements in index 4, updating n_results = 4


'1. 데이터 사이언스 프로젝트를 성공적으로 이끈 경험에 대해 이야기해주세요. 어떤 도전과제가 있었고, 어떻게 해결했나요? 2. 팀원들 간의 협업과 지식 공유를 어떻게 촉진하시나요? 팀 리더로서 어떤 역할을 맡으시나요? 3. 데이터 사이언스 분야에서의 최신 동향과 기술에 대해 어떻게 업데이트하고, 이를 팀에 적용하는 방법을 설명해주세요.'

In [27]:
# Asks (in Korean): besides candidate 3, recommend exactly one more candidate.
question = """
후보자 3번 외에 다른 후보자를 한 사람만 더 추천해줘.
"""
result = qa.invoke({"question": question})
# Collapse line breaks so the answer is displayed on a single line.
(
    result["answer"]
    .replace("\n\n", " ")
    .replace("\n", " ")
)

Number of requested results 50 is greater than number of elements in index 4, updating n_results = 4


'후보자 1번은 다양한 분야에서 경험을 쌓은 AI 엔지니어로, 커뮤니케이션 및 협업 능력이 뛰어납니다. 그들은 지속적으로 노션에 학습 일지를 작성하며, 자기계발과 연구를 통해 새로운 지식과 기술을 습득하는 데 열정적입니다. 최근에는 LLM, LoRA, RA G 방법론에 관심을 가지고 관련 연구에 적극적으로 참여하고 있습니다.'

In [29]:
# Asks (in Korean): what are candidate 1's education and major?
question = """
후보자 1번의 학력과 전공은 어떻게 돼?
"""
result = qa.invoke({"question": question})
# Collapse line breaks so the answer is displayed on a single line.
(
    result["answer"]
    .replace("\n\n", " ")
    .replace("\n", " ")
)

Number of requested results 50 is greater than number of elements in index 4, updating n_results = 4


'후보자 1번은 부경대학교와 부산대학교에서 산업 및 데이터 공학 석사 학위를 취득하였습니다 (2023.03). 또한, 부경대학교에서 시스템 관리 공학 학사 학위를 취득하였으며, 전공은 기술 데이터 공학입니다 (2017-2023.02).'