In [5]:
from langchain_community.document_loaders import UnstructuredFileLoader
from langchain_core.runnables import RunnableParallel,RunnablePassthrough
import os,asyncio,warnings
warnings.filterwarnings("ignore")

data = [file for file in os.listdir('data') if file.endswith(".pdf")]

In [10]:
from langchain.document_loaders import PyPDFLoader
from langchain_core.documents import Document
pages = PyPDFLoader(file_path=f'/home/faizan/Downloads/Get_Started_With_Smallpdf.pdf').load()

In [6]:
def flatteningdocs(data):
    return [item for sublist in data for item in sublist]

from langchain.document_loaders import PyPDFLoader
def get_data(file_path):
    pages = PyPDFLoader(file_path=f'data/{file_path}').load()
    if len(pages) > 1:
        pdfstring = ""
        metadata = {}
        for page in pages:
            pdfstring += page.page_content
            metadata.update(page.metadata)

        return [Document(
            page_content=pdfstring,
            metadata=metadata)]
        
    else:
        return pages

content_chain = RunnablePassthrough() | get_data
content = flatteningdocs(await content_chain.abatch(data))

In [7]:
len(content)

0

In [64]:
import pandas as pd
df = pd.read_csv('UpdatedResumeDataSet.csv')
# resumes = df['Resume'].tolist()
resumes = df[(df['Category'] == "Database") | (df['Category'] == "Data Science") | (df['Category'] == 'Python Developer')]

resumes.drop_duplicates(inplace=True)
resumes.shape

(27, 2)

In [67]:
from fpdf import FPDF

data_tuples = [tuple(x) for x in resumes.to_records(index=False)]

def save_to_pdf(data):
    for index,(category, resume_text) in enumerate(data):
        pdf = FPDF()
        pdf.add_page()
        pdf.set_font('Arial', size=10)
        pdf.set_title(f"{category}.pdf")
        
        # Encode resume_text to 'latin-1' to handle Unicode characters
        try:
            encoded_text = resume_text.encode('latin-1', 'replace').decode('latin-1')
        except Exception as e:
            print(f"Error encoding text for {category}.pdf: {e}")
            continue
        
        pdf.multi_cell(0, 10, encoded_text, border=1)
        
        # Save the PDF file
        pdf_file_name = f"data/{category}{index + 1}.pdf"
        pdf.output(pdf_file_name)

save_to_pdf(data_tuples)

In [29]:
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
model_name = "sentence-transformers/all-mpnet-base-v2"
embedding = HuggingFaceEmbeddings(model_name=model_name)
# vector_store = Chroma.from_texts(embedding=embedding,texts=resumes)
persist_directory = "database"
vector_store = Chroma.from_documents(embedding=embedding,documents=content,persist_directory=persist_directory)
vector_store

In [16]:

retriever = vector_store.as_retriever(
        search_kwargs={'k':20})

# retriever = vector_store.as_retriever(search_type="similarity_score_threshold",
        # search_kwargs={'score_threshold': 0.1,'k':30})
# print(len(getunique(retriever.invoke(des))))
len(retriever.invoke(des))

20

In [18]:
from langchain.retrievers.document_compressors import LLMChainFilter
from langchain.retrievers import ContextualCompressionRetriever
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_groq import ChatGroq
import os

os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
os.environ['LANGCHAIN_API_KEY'] = os.getenv("LANGCHAIN_API_KEY")
os.environ["LANGCHAIN_PROJECT"] = "CV"

Groq = ChatGroq(
    temperature=0,
    model="llama3-70b-8192").with_fallbacks([ChatGoogleGenerativeAI(model="gemini-1.5-flash",google_api_key=os.getenv("google_api_key"))])

# llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash",google_api_key=os.getenv("google_api_key")).with_fallbacks([ChatGoogleGenerativeAI(model="gemini-1.5-flash",google_api_key=os.getenv("dgoogle_api_key"))]).with_fallbacks([Groq])

from langchain_core.prompts import PromptTemplate

from langchain.output_parsers.boolean import BooleanOutputParser

prompt_template = """You are a powerfull assistant your task is to check weather the given CV match the given job requirements.Return only Yes if it match else return No.
<job description>{question}</job description>
# <cv>{context}</cv>
> Relevant (YES / NO):"""

def _get_default_chain_prompt() -> PromptTemplate:
    return PromptTemplate(
        template=prompt_template,
        input_variables=["question", "context"],
        output_parser=BooleanOutputParser())

_filter = LLMChainFilter.from_llm(Groq,prompt=_get_default_chain_prompt())

compression_retriever = ContextualCompressionRetriever(
    base_compressor=_filter, base_retriever=retriever)

compressed_docs = compression_retriever.invoke(des)

compressed_docs

[Document(metadata={'page': 4, 'source': 'data/Data Science8.pdf'}, page_content='Education Details \n B.Tech   Rayat and Bahra Institute of Engineering and Biotechnology\nData Science \nData Science\nSkill Details \nNumpy- Exprience - Less than 1 year months\nMachine Learning- Exprience - Less than 1 year months\nTensorflow- Exprience - Less than 1 year months\nScikit- Exprience - Less than 1 year months\nPython- Exprience - Less than 1 year months\nGCP- Exprience - Less than 1 year months\nPandas- Exprience - Less than 1 year months\nNeural Network- Exprience - Less than 1 year monthsCompany Details \ncompany - Wipro\ndescription - Bhawana Aggarwal\nE-Mail:bhawana.chd@gmail.com\nPhone: 09876971076\nVVersatile, high-energy professional targeting challenging assignments in Machine\nPROFILE SUMMARY\nâ–ª An IT professional with knowledge and experience of 2 years in Wipro Technologies in Machine\nLearning, Deep Learning, Data Science, Python, Software Development.\nâ–ª Skilled in managin

In [21]:
[os.remove(f'shortlisted_cvs/{file}') for file in os.listdir('shortlisted_cvs')]


[None, None, None, None, None]

In [9]:
# # Install package
# %pip install --upgrade --quiet "unstructured[all-docs]" --break-system-packages

Note: you may need to restart the kernel to use updated packages.


In [18]:
import os
os.makedirs('test',exist_ok=True)
os.makedirs("test/faizan",exist_ok=True)

In [19]:
import shutil
shutil.rmtree("test/faizan")

In [19]:
from langchain_core.runnables import RunnablePassthrough,RunnableParallel
from langchain.prompts import PromptTemplate
from langchain_groq import ChatGroq
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.output_parsers.boolean import BooleanOutputParser
import os

prompt_template = """
<job description>
{job_des}
</job description>
------------
<cv>
{cv}
</cv>

> Relevant (YES / NO):
"""

template = PromptTemplate(
        template=prompt_template,
        input_variables=["job_des", "cv"])

from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.output_parsers.openai_tools import PydanticToolsParser

# Note that the docstrings here are crucial, as they will be passed along
# to the model along with the class name.
class predict_bool(BaseModel):
    """You are a powerful HR assistant. Your task is to review the given CV and determine if it matches the job requirements specified in the job description and give socre between 0 and 1 based on their relvancy.Please do your best it is very important to my career.if both or any of feild is empty then also return 0.
    Also give return the matching score betweeen 0 and 1
    > Relevant Score Between (0 and 1):
    """

    # selection: str = Field(..., description="YES/NO")
    score: str = Field(..., description="Give the score to the cv between 0 and 1")

google = ChatGoogleGenerativeAI(max_retries=0,temperature=0,model="gemini-1.5-flash",google_api_key=os.getenv("google_api_key"))

groq = ChatGroq(temperature=0,max_retries=0,model_name="mixtral-8x7b-32768").with_fallbacks([google])

groq2 = ChatGroq(api_key="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",temperature=0,max_retries=0,model_name="mixtral-8x7b-32768").with_fallbacks([groq])

llm_with_tools = groq.bind_tools([predict_bool],tool_choice="predict_bool")

job_des_cv_chain = RunnablePassthrough.assign(
selection_Bool = RunnablePassthrough.assign(
    source = (lambda x:x["source"]),
    job_des = (lambda x:x["job_des"]),
    cv = (lambda x:x["cv"])) | template | llm_with_tools | PydanticToolsParser(tools=[predict_bool]))

from utils import r_cv,ir_cv

# job_des_cv_chain.invoke({"job_des":des,"cv":r_cv,"source":"some"})

{'job_des': "Company Description\n \nEvergreen Future Tech (EFT) is a software solutions provider located in Islamabad, Pakistan. We offer cutting-edge software solutions and specialize in artificial intelligence technologies.\n\n Role Description\n \nThis is a contract role for an Artificial Intelligence Engineer. The AI Engineer will be responsible for developing and implementing AI technologies, such as pattern recognition, neural networks, and natural language processing. The engineer will also be involved in software development and computer science-related tasks. This role is an on-site position located in Islamabad, Pakistan.\n\n Qualifications\n \nStrong background in computer science and software development\nProficiency in pattern recognition and neural networks\nExperience in natural language processing (NLP)\nProblem-solving and analytical skills\nExperience with AI frameworks and tools\nKnowledge of machine learning algorithms\nExcellent coding skills in languages such as 

Failed to batch ingest runs: LangSmithRateLimitError('Rate limit exceeded for https://api.smith.langchain.com/runs/batch. HTTPError(\'429 Client Error: Too Many Requests for url: https://api.smith.langchain.com/runs/batch\', \'{"detail":"Monthly unique traces usage limit exceeded"}\')')
Failed to batch ingest runs: LangSmithRateLimitError('Rate limit exceeded for https://api.smith.langchain.com/runs/batch. HTTPError(\'429 Client Error: Too Many Requests for url: https://api.smith.langchain.com/runs/batch\', \'{"detail":"Monthly unique traces usage limit exceeded"}\')')


In [3]:
from langchain_community.document_loaders import UnstructuredFileLoader
from langchain_core.runnables import RunnablePassthrough
import os,asyncio,warnings,uuid
warnings.filterwarnings("ignore")

def flatteningdocs(data):
    # des = "we are looing for python developer just no more"
    flatten =  [item for sublist in data for item in sublist]
    return [{"source":cv.metadata['source'],"cv":cv.page_content,"job_des":des} for cv in flatten]

def get_data(file_path):
    try:
        pages = UnstructuredFileLoader(file_path=file_path).load()
        if pages[0].page_content:
            return pages
        
        return []
    
    except Exception as e:
        return []

pdfs_dir = "data/resumes/8cb73b16-615b-47d4-8c14-5735cd3bebfd"

pdfs_path = [f"{pdfs_dir}/{pdf}" for pdf in os.listdir(pdfs_dir)]

content_chain = RunnablePassthrough() | get_data
pdfs_content = flatteningdocs(await content_chain.abatch(pdfs_path[:5]))

In [40]:
def shortlist_cvs(cv_list, percentage):
    scored_cvs = [(cv.get("source"),cv.get("selection_Bool")[0].score) for cv in cv_list]
    
    # Sort CVs based on relevance scores in descending order
    scored_cvs.sort(key=lambda x: x[1], reverse=True)
    
    # Calculate the number of CVs to shortlist based on the percentage
    shortlist_count = int(len(cv_list) * percentage / 100)
    
    # Select the top N percent CVs
    shortlisted_cvs = scored_cvs[:shortlist_count]
    
    return shortlisted_cvs

In [54]:
res = job_des_cv_chain.batch(pdfs_content)

In [60]:
shortlist_cvs(res,percentage=60)

[('data/resumes/8cb73b16-615b-47d4-8c14-5735cd3bebfd/Python Developer78.pdf',
  '0.65'),
 ('data/resumes/8cb73b16-615b-47d4-8c14-5735cd3bebfd/Python Developer76.pdf',
  '0.6'),
 ('data/resumes/8cb73b16-615b-47d4-8c14-5735cd3bebfd/Python Developer74.pdf',
  '0.5')]

Failed to batch ingest runs: LangSmithConnectionError('Connection error caused failure to POST https://api.smith.langchain.com/runs/batch  in LangSmith API. Please confirm your internet connection.. SSLError(MaxRetryError("HTTPSConnectionPool(host=\'api.smith.langchain.com\', port=443): Max retries exceeded with url: /runs/batch (Caused by SSLError(SSLEOFError(8, \'EOF occurred in violation of protocol (_ssl.c:2406)\')))"))')
Failed to batch ingest runs: LangSmithRateLimitError('Rate limit exceeded for https://api.smith.langchain.com/runs/batch. HTTPError(\'429 Client Error: Too Many Requests for url: https://api.smith.langchain.com/runs/batch\', \'{"detail":"Monthly unique traces usage limit exceeded"}\')')


In [63]:
from langchain_core.pydantic_v1 import BaseModel, Field

# Note that the docstrings here are crucial, as they will be passed along
# to the model along with the class name.
class predict_bool(BaseModel):
    """You are a powerful HR assistant. Your task is to review the given CV and determine if it matches the job requirements specified in the job description. Return only "YES" if it matches the requirements; otherwise, return "NO".Please do your best it is very important to my career.if both or any of feild is empty then also return NO."""

    selection: str = Field(..., description="Yes/No")

llm_with_tools = model.bind_tools([predict_bool])

llm_with_tools.invoke("HI how are you?")

AIMessage(content="Hello! I'm an AI and I don't have feelings, but I'm here to help you. How can I assist you today?\n\n(To use a tool, return a JSON object as a JSON string that matches the tool_use structure provided in the instructions. If no tool is needed, simply respond directly.)", response_metadata={'token_usage': {'completion_tokens': 70, 'prompt_tokens': 1245, 'total_tokens': 1315, 'completion_time': 0.111389901, 'prompt_time': 0.092694035, 'queue_time': None, 'total_time': 0.204083936}, 'model_name': 'mixtral-8x7b-32768', 'system_fingerprint': 'fp_c5f20b5bb1', 'finish_reason': 'stop', 'logprobs': None}, id='run-ca2b9983-aa20-4718-8cfc-cd77d7e155b8-0')

In [13]:
from transformers import pipeline

# Load a pre-trained language model for zero-shot classification
model_name = "facebook/bart-large-mnli"
classifier = pipeline("zero-shot-classification", model=model_name)

def score_cv(job_description, cv_text):
    # Define the labels
    labels = ["relevant", "not relevant"]
    # Classify the CV against the job description
    result = classifier(cv_text, candidate_labels=labels, hypothesis_template=f"This CV is {labels[0]} for the job description.")
    # Return the relevance score for "relevant"
    relevance_score = result["scores"][result["labels"].index("relevant")]
    return relevance_score

def shortlist_cvs(job_description, cv_list, percentage):
    scored_cvs = []
    
    for cv in cv_list:
        score = score_cv(job_description, cv)
        scored_cvs.append((cv, score))
    
    # Sort CVs based on relevance scores in descending order
    scored_cvs.sort(key=lambda x: x[1], reverse=True)
    
    # Calculate the number of CVs to shortlist based on the percentage
    shortlist_count = int(len(cv_list) * percentage / 100)
    
    # Select the top N percent CVs
    shortlisted_cvs = scored_cvs[:shortlist_count]
    
    return shortlisted_cvs

# Example usage
job_description = "Job description text here..."
cv_list = ["CV text 1...", "CV text 2...", "CV text 3..."]

# Shortlist the top 10% of CVs
percentage = 10
shortlisted_cvs = shortlist_cvs(job_description, cv_list, percentage)

# Output the shortlisted CVs and their scores
for cv, score in shortlisted_cvs:
    print(f"CV: {cv}, Score: {score:.2f}")




KeyboardInterrupt: 