# Automated CV Matching Phase One

In [None]:
from langchain_pinecone import PineconeVectorStore
from pydantic import BaseModel
import instructor
from openai import OpenAI
import requests
from langchain.indexes import SQLRecordManager,index
from langchain_community.document_loaders import (
    PyPDFLoader,
    S3FileLoader,
    S3DirectoryLoader,
    Docx2txtLoader
    )
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
import os
import pathlib

# --- Credentials ------------------------------------------------------------
# Keep a PINECONE_API_KEY that is already exported in the environment; only
# fall back to the blank placeholder when none is set, so a real key is never
# overwritten with an empty string.
os.environ.setdefault('PINECONE_API_KEY', '')
# OpenAI key handed to the embeddings client below; taken from OPENAI_API_KEY
# when it is exported, otherwise the same blank placeholder as before.
api_keys = os.environ.get('OPENAI_API_KEY', '')

# Pinecone index used for the job-description embeddings.
index_name = "job-descriptions"

embeddings = OpenAIEmbeddings(api_key=api_keys)
# Character-based splitter: chunks of up to 6000 characters with a 500
# character overlap between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=6000,
    chunk_overlap=500,
    length_function=len,
    is_separator_regex=False,
)


In [None]:
### file loader func
## Phase 1 (one time process)



def embed_load_document_to_pincone(file_path):
    """
    Split a job-description file into chunks and index them into Pinecone.

    The loader is chosen from the file extension: ``.pdf`` (any letter case)
    uses ``PyPDFLoader``; every other extension is handed to
    ``Docx2txtLoader``.  The chunks are written through LangChain's ``index``
    API, backed by a local SQLite record manager, to avoid writing duplicated
    content into the vector store.

    Args:
        file_path: Path of the PDF or Word document to load.

    Returns:
        None.  Chunk contents and a completion message are printed.
    """
    # Record manager that tracks what has already been indexed.
    source = 'resume'
    namespace = f'pincone-{source}'
    record_manager = SQLRecordManager(namespace, db_url='sqlite:///resume.sql')
    record_manager.create_schema()

    # Pick the loader from the extension; compare case-insensitively so a
    # file named "JD.PDF" is not sent to the Word loader.
    if pathlib.Path(file_path).suffix.lower() == ".pdf":
        loader = PyPDFLoader(file_path)
    else:
        loader = Docx2txtLoader(file_path)
    doc = text_splitter.split_documents(loader.load())

    # `index` is the only write path.  The previous extra
    # `PineconeVectorStore.from_documents` call uploaded the same chunks a
    # second time, bypassing the record manager and creating the duplicates
    # this step is meant to prevent.
    store = PineconeVectorStore(text_key='text', embedding=embeddings, index_name=index_name)
    index(doc, record_manager, store, cleanup="incremental", source_id_key='source')

    for chunk in doc:
        print(chunk.page_content)

    print("document uploaded to pincone db.........")
    
# Phase 1 driver: index the sample Word job description.
embed_load_document_to_pincone("samplefile/wordJobDescription.docx")

## Automated CV Matching Phase Two

In [3]:
### file loader extractor func
## Phase 2 (batch and realtime)

def resume_loader(storage_name=None, localfile=None):
    """
    Load resumes from the selected storage and split them into chunks.

    Args:
        storage_name: ``'s3'`` loads everything under the ``resume`` prefix of
            the ``resume-bucket`` bucket.  ``'dropbox'`` is accepted but not
            implemented yet.  Any other value (including ``None``) reads
            ``localfile`` instead.
        localfile: Path of a local resume.  ``.pdf`` files (any letter case)
            go through ``PyPDFLoader``; every other extension is handed to
            ``Docx2txtLoader``.

    Returns:
        The chunked documents produced by ``text_splitter``.  An empty list
        is returned when there is nothing to load (Dropbox, or no
        ``localfile`` given) so callers can always iterate the result.
    """
    if storage_name == 's3':
        # NOTE(review): the AWS credentials are blank placeholders — confirm
        # how real credentials are meant to be supplied.
        s3_loader = S3DirectoryLoader(
            'resume-bucket',
            prefix="resume",
            aws_access_key_id="",
            aws_secret_access_key="",
        )
        return text_splitter.split_documents(s3_loader.load())

    if storage_name == 'dropbox':
        # Dropbox support is not implemented; report "no documents" rather
        # than returning None, which callers cannot iterate.
        return []

    if not localfile:
        return []

    # Case-insensitive extension check so "CV.PDF" still uses the PDF loader.
    if pathlib.Path(localfile).suffix.lower() == ".pdf":
        loader = PyPDFLoader(localfile)
    else:
        loader = Docx2txtLoader(localfile)
    return text_splitter.split_documents(loader.load())
          
# Phase 2 input: chunk the sample resume PDF from the local sample folder.
resume_loaders = resume_loader(localfile='samplefile/Owolabi_Akintan_resume.pdf')





In [4]:
 ## Similarity search 

# OpenAI client wrapped by instructor; the chat-completion call below passes a
# `response_model` to it and reads the reply back with `.model_dump()`.
client = instructor.from_openai(OpenAI(api_key=api_keys))


# Structured-output schema passed as `response_model` for the resume match call.
# NOTE(review): the class docstring is deliberately left untouched — presumably
# it is forwarded to the model as part of the response schema; verify before
# rewording it.
class MatchResume(BaseModel):
    """
    instructor response_model class
    
    """
    # The model's answer; the system prompt asks for 'No Match' when the
    # resume does not fit the job description.
    resume: str 
    

def job_description_augment_prompt_match(resume_query):
    """
    Match a resume against the most similar stored job descriptions.

    The three job-description chunks most similar to ``resume_query`` are
    fetched from Pinecone and passed, together with their relevance scores,
    as the context of a single chat completion.  Previously one completion
    was issued per retrieved chunk and all but the last answer were thrown
    away.

    Args:
        resume_query: Resume text to match.

    Returns:
        dict: ``MatchResume.model_dump()`` of the model's answer, i.e.
        ``{"resume": ...}``.  ``{"resume": "No Match"}`` is returned without
        calling the model when the vector store yields no results.
    """
    # get top 3 results from knowledge base
    vectorstore = PineconeVectorStore(index_name=index_name, embedding=embeddings)
    results = vectorstore.similarity_search_with_relevance_scores(resume_query, k=3)

    # Nothing retrieved: answer directly instead of failing on an unbound
    # response variable.
    if not results:
        return MatchResume(resume="No Match").model_dump()

    # One context block containing every retrieved chunk and its score.
    source_knowledge = "\n".join(
        f"Document: {result.page_content},\n Score: {score}"
        for result, score in results
    )

    resp = client.chat.completions.create(
        model="gpt-3.5-turbo",
        response_model=MatchResume,
        messages=[
            {"role": "system", "content": "You are a resume job description matcher. If Resume does not match the job description, return 'No Match'."},
            {"role": "user", "content": f"using the Contexts: Job Description: \n {source_knowledge} \n  match the following Resumes : {resume_query}"}
        ]
    )

    return resp.model_dump()


In [None]:
# Webhook api


def webhook_api(payload, timeout=30):
    """
    POST a payload as JSON to the webhook endpoint.

    Args:
        payload: JSON-serialisable object sent as the request body.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error.  Without a timeout the call can block forever.

    Returns:
        int: HTTP status code of the webhook response.
    """
    # NOTE(security): this API key is committed in source.  It should be
    # rotated and loaded from configuration instead of living in the notebook.
    header = {'Content-Type':'application/json',
              'X-Api-Key':'ef9f9fb91d819f7fae0fda9da47b11116e28564c49cd60d2017917ca99b6c3f4'}
    url = "https://sapphire.contextdata.ai/api/webpush/c5-000001uhj6bd/"
    response = requests.post(url, json=payload, headers=header, timeout=timeout)
    return response.status_code


## send match resume to webhook api endpoint
# `or []` keeps the loop from crashing when the loader produced nothing
# (None or an empty list).
for don in resume_loaders or []:
    # page_content is already a string; no copy is needed before matching.
    matcher_resume_payload = job_description_augment_prompt_match(don.page_content)
    print("uploading match resume to webhook payload")
    webhook_api(matcher_resume_payload)
    

## Chat with Your PDF

In [50]:
from langchain_pinecone import PineconeVectorStore
from openai import OpenAI
import os
import instructor
from pydantic import BaseModel
from langchain_community.document_loaders import (
    PyPDFLoader,
    )
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings

# --- Credentials ------------------------------------------------------------
# Keep a PINECONE_API_KEY that is already exported in the environment; only
# fall back to the blank placeholder when none is set, so a real key is never
# overwritten with an empty string.
os.environ.setdefault('PINECONE_API_KEY', '')
# OpenAI key handed to the embeddings client below; taken from OPENAI_API_KEY
# when it is exported, otherwise the same blank placeholder as before.
api_keys = os.environ.get('OPENAI_API_KEY', '')
# Pinecone index used for the PDF chat embeddings.
index_name = "chat"

embeddings = OpenAIEmbeddings(api_key=api_keys)
# Character-based splitter: chunks of up to 1000 characters with a 200
# character overlap between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
    is_separator_regex=False,
)

In [63]:

# OpenAI client wrapped by instructor; `query_pdf` passes a `response_model`
# to it and reads the reply back with `.model_dump()`.
client = instructor.from_openai(OpenAI(api_key=api_keys))


# Structured-output schema passed as `response_model` by `query_pdf`.
# NOTE(review): the class docstring is deliberately left untouched — presumably
# it is forwarded to the model as part of the response schema; verify before
# rewording it.
class PfdResponse(BaseModel):
    """
    instructor response_model class for pdf response
    
    """
    # Answer text produced by the model.
    text: str 
    # Page number; the prompt supplies it from the chunk's `page` metadata.
    page: int
    # Source file; the prompt supplies it from the chunk's `source` metadata.
    source:str
    
    

def load_pdf(file_path):
    """
    Read a PDF, chunk it, and upload the chunks to the Pinecone index.

    Args:
        file_path: Location of the PDF to ingest.

    Returns:
        The ``PineconeVectorStore`` built by ``from_documents``.
    """
    # Load every page, then cut the pages into overlapping chunks.
    chunks = text_splitter.split_documents(PyPDFLoader(file_path).load())
    # Embed and upload in one step; hand the resulting store back to the caller.
    return PineconeVectorStore.from_documents(chunks, embeddings, index_name=index_name)
    

def query_pdf(query):
    """
    Answer a question from the PDF chunks stored in Pinecone.

    The three chunks most similar to ``query`` are fetched and passed, with
    their page and source metadata, as the context of a single chat
    completion.  Previously one completion was issued per chunk and all but
    the last answer were thrown away.

    Args:
        query: Question or search phrase.

    Returns:
        dict: ``PfdResponse.model_dump()`` with the keys ``text``, ``page``
        and ``source``.  When the vector store yields no results the model is
        not called and ``{"text": "No such document", "page": -1,
        "source": ""}`` is returned; ``-1`` marks "no page".

    Raises:
        KeyError: If a retrieved chunk lacks ``page`` or ``source`` metadata.
    """
    vectorstore = PineconeVectorStore(index_name=index_name, embedding=embeddings)
    results = vectorstore.similarity_search_with_relevance_scores(query, k=3)

    # Nothing retrieved: answer directly instead of failing on an unbound
    # response variable.
    if not results:
        return PfdResponse(text="No such document", page=-1, source="").model_dump()

    # One context block containing every retrieved chunk and its metadata.
    source_knowledge = "\n".join(
        f"text: {result.page_content},\n Page: {result.metadata['page']}, \n Source: {result.metadata['source']}"
        for result, _ in results
    )

    resp = client.chat.completions.create(
        model="gpt-3.5-turbo",
        response_model=PfdResponse,
        messages=[
            {"role": "system", "content": "You are a pdf summarizer assistant. If does not match the query description, dont generate random once just return 'No such document'."},
            {"role": "user", "content": f"using the text: \n {source_knowledge} \n  to answer the query: {query}"}
        ]
    )

    return resp.model_dump()
    
        
    
    
    
# Example query against the indexed PDF; the bare `res` on the last line makes
# the notebook display the returned dict.
res = query_pdf('management to enable dietary freedom in people with type')

res

    

{'text': 'The type 1 diabetes story is one of the power of self-management. The tools for this are technological, but the challenge is social and psychological. We now know that structured education works, but it is more complicated and less glamorous than a new drug, and so isn’t currently given the attention that it demands. We also know that education on its own doesn’t do the job. Teaching people to do things in a new way only gives them another option. While some people with diabetes embrace the flexibility of insulin that fits into their lives, for others, this means more uncertainty and more cause for concern. As with any other aspect of life, different people have different approaches to routines, and they have wildly varying everyday circumstances.\nDiabetes education and self-management are about more than',
 'page': 27,
 'source': 'samplefile\\pdfquery\\politics_talk.pdf'}