### Repository Name: Resume Recommender

#### Description:
A Resume Recommender Python notebook powered by Large Language Models (LLMs) via Azure OpenAI. This notebook demonstrates how to use state-of-the-art natural language processing techniques to recommend resumes for specific job requirements. Built on the Azure OpenAI service, this project aims to streamline the recruitment process by accurately matching candidates to job descriptions.

In [None]:
# loading dependencies 

import os
import json
import os
import openai
import boto3
from botocore.exceptions import ClientError
import pandas as pd
from langchain.document_loaders import DirectoryLoader
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings, AzureOpenAI
import pytesseract
# Tell pytesseract where the Tesseract OCR executable lives. This is a
# hard-coded Windows path; adjust it to match the local installation.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'



In [None]:
# Connection settings come from environment variables; the deployment/model
# names and the sampling temperature are fixed in the notebook.
API_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
API_VERSION = os.getenv("OPENAI_API_VERSION")
API_TYPE = os.getenv("OPENAI_API_TYPE")
API_KEY = os.getenv("OPENAI_API_KEY")
DEPLOYMENT_MODEL = "gpt-35-turbo-16k"
COMPLETIONS_MODEL = "gpt-3.5-turbo-16k"
EMBEDDING_MODEL = "text-embedding-ada-002"
TEMP = 0.3  # temperature

# Keyword arguments shared by the chat client and the embeddings client
_azure_connection = {
    "azure_endpoint": API_ENDPOINT,
    "openai_api_version": API_VERSION,
    "openai_api_key": API_KEY,
    "openai_api_type": API_TYPE,
}

# Chat model used for language generation
llm = AzureChatOpenAI(
    deployment_name=DEPLOYMENT_MODEL,
    model_name=DEPLOYMENT_MODEL,
    tiktoken_model_name=COMPLETIONS_MODEL,
    temperature=TEMP,
    **_azure_connection,
)

# Embeddings client used to vectorize the resume text
embeddings = AzureOpenAIEmbeddings(
    deployment=EMBEDDING_MODEL,
    model=EMBEDDING_MODEL,
    **_azure_connection,
)


In [None]:
def load_docs(directory):
    """
    Read every document found under ``directory`` with a ``DirectoryLoader``.

    The loader is created with a progress bar enabled and is passed the
    "hi_res" strategy and "elements" mode as loader keyword arguments.

    Args:
        directory (str): Path of the folder that holds the documents.

    Returns:
        list: The documents returned by the loader.
    """
    parser_options = {"strategy": "hi_res", "mode": "elements"}
    return DirectoryLoader(
        directory,
        show_progress=True,
        loader_kwargs=parser_options,
    ).load()

# Hard-coded folder containing the resume files to index; point this at your
# own data directory before running the cell.
directory = 'C:/github/Resume chatbot/test_data/'
docs = load_docs(directory)
# Report how many document objects the loader returned.
print(f"number of docs: {len(docs)}")


##### experimental part - in development
This part is not completed yet; the idea is to store summaries of multiple resumes rather than create an embedding for each complete document.


In [None]:
def concat_documents_content(documents):
    """
    Concatenates the content of a list of document objects into a single string.

    Each document's ``page_content`` is prefixed with a newline and a space,
    so the result for two documents is ``"\\n <first>\\n <second>"``.

    Args:
        documents (list): A list of document objects exposing ``page_content``.

    Returns:
        str: The concatenated content of the documents; an empty string when
            ``documents`` is empty.
    """
    # A single join avoids the repeated copying of an incremental `+=` build.
    return "".join(f"\n {doc.page_content}" for doc in documents)


# Concatenate the content of the documents into a single string
# NOTE(review): every loaded document is merged into one text, while the prompt
# below describes a single resume - confirm this is intended.
txt = concat_documents_content(docs)

# Run the LLM
# Instruction asking the model to summarize the resume text that is wrapped
# between the > and < markers.
prompt = "the following text between the > and < signs is a complete resume text extracted from a document. You are an HR summarizer that should summerize the text and focus on the following key sub topics: Name, location, education, skills and tools, programming languages, and experience. you can add also additional information which will be a one line extra information from the document only that worth mentioning"
# Wrap the text in the > ... < delimiters the prompt refers to.
context = f"> {txt} <"
# Send instruction and delimited text as one message and print the reply text.
txt_summary = llm.invoke(prompt + context)
print(txt_summary.content)
    

#### ---> end of experimental code <----

In [None]:
# Embed and store the texts
# Supplying a persist_directory will store the embeddings on disk
persist_directory = 'db'

## here we are using Azure OpenAI embeddings 
from langchain.vectorstores import Chroma
# Build the vector store from the loaded documents, embedding each one with
# the Azure OpenAI embeddings client.
vectordb = Chroma.from_documents(documents=docs, 
                                 embedding=embeddings,
                                 persist_directory=persist_directory)
# persist the db to disk
vectordb.persist()
# Drop the in-memory handle so the reload below reads from the persisted copy.
vectordb = None

# Now we can load the persisted database from disk, and use it as normal. 
vectordb = Chroma(persist_directory=persist_directory, 
                  embedding_function=embeddings)

chromaretriever = vectordb.as_retriever(search_kwargs={"k": 2}) # return the top 2 matches; search_type is not set here, so the default ("similarity") applies

In [None]:

# No import is needed here: `chromaretriever` is the retriever object built
# from the vector store earlier in the notebook, not an importable module.
# Importing it would fail (or rebind the name to an unrelated module).

# Define a function to retrieve the page content for a given query
def retrieve_page_content(query):
    """
    Retrieves the page content for a given query using chromaretriever.

    Args:
        query (str): The query to search for.

    Returns:
        str: The page content of the first retrieved document.

    Raises:
        IndexError: If the retriever returns no documents.
    """
    # Invoke the chromaretriever to retrieve the documents
    matches = chromaretriever.invoke(query)

    # Return the page content of the first document
    return matches[0].page_content

# Example usage: run two sample queries and print the first match for each
for query in (
    "which candidate is good fit for data analysis using Tableau roles.",
    "Give name of candidate who is good fit for a Data Analyst roles",
):
    page_content = retrieve_page_content(query)
    print(page_content)


In [None]:
from langchain.chains import RetrievalQA

# Build the question-answering chain: retrieved documents are passed to the
# LLM using the "stuff" chain type, and the source documents are returned
# alongside the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=chromaretriever,
    chain_type="stuff",
    return_source_documents=True,
    verbose=True,
)

## Cite sources
def process_llm_response(llm_response):
    """
    Print the answer held in an LLM response, followed by its sources.

    Parameters:
    llm_response (dict): Mapping with a 'result' entry (the answer) and a
        'source_documents' entry (documents whose metadata has a 'source').

    Returns:
    None
    """
    # Answer first, then a header, then one line per cited document.
    print(llm_response['result'])
    print('\n\nSources:')
    for cited_doc in llm_response["source_documents"]:
        origin = cited_doc.metadata['source']
        print(origin)

##### A complete example
Here I create a complete example with a detailed prompt. 
To adapt the scenario to a different role, replace the job description below with the one for your case.

In [None]:

# full example
# Guardrail text placed at the start of the prompt: states the role being
# hired for and tells the model not to invent answers or skills.
warning = "We are looking at resumes of candidates for a position of data analyst and Business Intelligence (BI) developer. If you don't know the answer, just say that you don't know, don't try to make up an answer and do not mention skills which doesn't appear in the context!"
# Job description inserted into the prompt; replace it to target another role.
job_description = """
We are looking for candidates for a student type of position for a Business intelligence (BI) developer and Analyst to work with internal customers, to understand and articulate business processes, map them to business intelligence requirements, and implement them into indicators, reports, and dashboards.

As a BI Developer and analyst you will design, build, and maintain reports and dashboards to monitor company, product, and business performance and KPI's.

The BI developers work with other team members from across the world to develop BI solutions and provide support to the BI systems and ongoing analysis needs.
This includes understanding the data and where to get it, extracting the data, processing it, optimizations, and turning data into insights and graphical representation in the BI platform (Tableau).

Here are the main JOB Qualifications we mostly looking for in each candidate resume:
1. Studies degree in the field of industrial engineering, information systems, and /or Statistics.
2. Experience in databases and SQL
3. Experience in business intelligence and data warehousing concepts and methodologies
4. Excellent self-learning capabilities
5. Strong written and verbal communication skills and customer engagement skills
6. Experience in Python and software development - Advantage
7. Familiarity with data mining, machine learning, and predictive analytics algorithms, methodologies, and approaches - Advantage
8. Knowledge or experience with MS Excel, Tableau, Power BI - Advantage
9. Experience with AWS and cloud-based solutions in general - Advantage
10. Availability for 3 days a week

"""
# Prompt prefix = guardrail + job description + a sentence framing the
# retrieved context as the candidates' resumes.
question = warning+job_description + " Based on the given job description and the context you are provided with, which is the actual resumes of candidates for this job.:  "
# Final query: the prefix followed by the short-listing instructions.
query = question + " short list up to two resumes which are good fit based on match of: skills and tools, education and work experience mentioned in it. also you must provide the candidate name which usually will be mentioned in first line of pdf without subheading. next to each name mention the key reasons why you think he or she are the best fit. make sure not to list skills and information from the job description but from the resumes in the context. In case there are no good fit in the context resumes, list only one or mention that there are no good fit."

# query = question + "retrive the full document information of top 3 resumes which are good fit based on skills,education and work experience mwntioned in it? "
# query = "short list resumes which is good fit for Data analysis roles based on skills,education and work experience mwntioned in it?"

# Run the chain through `invoke` (calling the chain object directly is
# deprecated in LangChain 0.1+), consistent with the other `.invoke` calls in
# this notebook, then print the answer together with its source documents.
llm_response = qa_chain.invoke({"query": query})
process_llm_response(llm_response)



In [None]:
# Query the retriever directly with the full prompt to inspect which
# documents it returns for it.
resume_doc = chromaretriever.invoke(query)
print(resume_doc)

In [None]:
# Keep only the text of the second retrieved document.
# NOTE(review): this rebinds `resume_doc` from the retriever result to a
# string, so re-running the cell without re-running the previous one fails;
# it also assumes at least two documents were retrieved.
resume_doc = resume_doc[1].page_content
print(resume_doc)

In [None]:
# Fetch the records held in the vector store and show the first stored text.
db_records = vectordb.get()
print(db_records['documents'][0]) # 'documents' selects the stored texts; change the key to inspect other fields of the returned records