In [None]:
from dotenv import load_dotenv
import os
import requests
import json
from bs4 import BeautifulSoup

load_dotenv()

from langchain_google_vertexai import ChatVertexAI
from langchain.prompts import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate
)
from langchain_core.messages import SystemMessage
from langchain_google_community import DocAIParser
from langchain_core.document_loaders.blob_loaders import Blob

In [None]:
# Runtime configuration, read from the process environment (populated by
# load_dotenv() in the imports cell). Any variable that is not set is None.

# Google Cloud project / index settings
PROJECT_ID = os.environ.get("PROJECT_ID")
REGION = os.environ.get("REGION")
INDEX_NAME = os.environ.get("INDEX_NAME")
INDEX_ID = os.environ.get("INDEX_ID")

# Cloud SQL credentials
DB_USER = os.environ.get("CLOUD_SQL_USER1")
DB_PASS = os.environ.get("CLOUD_SQL_PASSWORD")

# Storage bucket and Document AI processor used when parsing the resume
GCS_BUCKET_NAME = os.environ.get("GCS_BUCKET_NAME")
DOC_PROCESSOR_NAME = os.environ.get("DOC_PROCESSOR_NAME")

In [None]:
def scrape_webpage_content(url, timeout=30):
    """Fetch a web page and return its visible text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        The page text with one text node per line and surrounding whitespace
        stripped, or ``None`` if the request failed or the server answered
        with a status other than 200.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Report transport errors the same way as HTTP errors below so the
        # caller only has one failure signal (None) to check.
        print(f"Failed to fetch page: {exc}")
        return None

    if response.status_code != 200:
        print(f"Failed to fetch page, status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    return soup.get_text(separator='\n', strip=True)


def get_json_from_result(result):
    """Parse the JSON payload carried in ``result.content``.

    The text may be bare JSON, JSON wrapped in a Markdown code fence
    (```` ``` ```` or ```` ```json ````), or JSON surrounded by a little extra
    prose. Fixed-offset slicing only handled one exact fence layout, so the
    wrapper is detected instead of assumed.

    Args:
        result: Any object exposing the response text as a ``content`` string.

    Returns:
        The decoded JSON value.

    Raises:
        json.JSONDecodeError: If no valid JSON can be recovered from the text.
    """
    text = result.content.strip()

    if text.startswith("```"):
        # Drop the whole opening fence line, including an optional language tag.
        first_newline = text.find("\n")
        text = text[first_newline + 1:] if first_newline != -1 else text[3:]
        text = text.rstrip()
        if text.endswith("```"):
            text = text[:-3]
        text = text.strip()

    try:
        return json.loads(text)
    except json.JSONDecodeError:
        # Fall back to the outermost {...} span in case the JSON object is
        # surrounded by explanatory text.
        start, end = text.find("{"), text.rfind("}")
        if start == -1 or end <= start:
            raise
        return json.loads(text[start:end + 1])

In [None]:
# Job-posting pages to pull descriptions from; only urls[0] is processed in
# the cells below.
urls = [
    "https://www.linkedin.com/jobs/view/4157337860",
    "https://job-boards.greenhouse.io/scaleai/jobs/4413274005",
    "https://www.linkedin.com/jobs/view/4091428817",
]

In [None]:
# Prompt used to pull the job description out of raw scraped page text.
# The system message is fixed text that asks for a JSON object with a
# "job_description" key; the human message is a template whose
# {webpage_content} placeholder is filled in at invoke time.
chat_template = ChatPromptTemplate.from_messages(
    [
        SystemMessage(
            content=(
                # Two adjacent string literals: they are concatenated into a
                # single system prompt.
                "You are a helpful assistant that helps to extract the job description from a corpus of words from a webpage"
                """
                You analyse the content provide, and extract only the job requirements and responsibilities.
                The content contains other irrelevant text extracted from the web page. Your job is to output only 
                the job requirements and responsibilities, skills, company. Do not modify the original words in the job description.
                Return the extracted job description only as a json object.
                Example:
                {
                    "job_description": "This is a job that does nothing"
                }
                """
            )
        ),
        HumanMessagePromptTemplate.from_template(
            "Kindly extract the job description from this: {webpage_content}"
        ),
    ]
)

# Prompt used to rewrite a resume so it matches a job description's keywords.
# The system message is fixed text describing the rules and the required
# response sections; the human message is a template whose {resume_content}
# and {job_description} placeholders are filled in at invoke time.
ats_template = ChatPromptTemplate.from_messages(
    [
        SystemMessage(
            content=(
                # Adjacent string literals are joined with nothing in between,
                # so every piece must end with its own separator; otherwise
                # sentences run together ("...complianceYou ensure...").
                "You are a helpful assistant that helps review resumes for ATS compliance.\n"
                """You ensure that you use the provided job description to make the resume more ATS friendly \n
                Do not remove quantifiable metrics found in the resume experience, but only modify the resume to include the keywords found in the job_description \n
                Ensure that the essence of the work experience in the resume content is retained while modifying the sentences to semantically include keywords in the job description\n
                Provide only content that should go into the final resume and nothing extra\n
                """
                """You always respond in a structured format with the following subheadings: \n
                1. Summary \n
                2. Experience \n
                3. Skills \n
                4. Missing Keywords \n
                5. In this section only, list changes made to the resume to make it ATS friendly that was previously missing \n
                """
            )
        ),
        HumanMessagePromptTemplate.from_template(
            "Make my resume and work experience more ATS friendly: \n {resume_content} \n using the job's description: \n {job_description}"
        ),
    ]
)

In [None]:
def get_job_description(chat_template, web_url, model_name="gemini-1.5-pro"):
    """Scrape a job posting and have the chat model extract its description.

    Args:
        chat_template: Prompt template that accepts a ``webpage_content``
            variable and instructs the model to answer with a JSON object
            containing a ``job_description`` key.
        web_url: Address of the job-posting page.
        model_name: Vertex AI chat model used for the extraction.

    Returns:
        The value stored under ``"job_description"`` in the model's JSON reply.

    Raises:
        ValueError: If no content could be scraped from ``web_url``.
        KeyError: If the model's JSON reply has no ``job_description`` key.
    """
    webpage_content = scrape_webpage_content(web_url)
    if not webpage_content:
        # A failed scrape yields None; sending that to the model would only
        # produce a made-up answer for the literal text "None".
        raise ValueError(f"Could not retrieve any content from {web_url}")

    chat_model = ChatVertexAI(model_name=model_name)
    chain = chat_template | chat_model
    result = chain.invoke({"webpage_content": webpage_content})

    content_json = get_json_from_result(result)
    return content_json["job_description"]


def get_resume_content(filename="Olawale_Machine_Learning_Engineer_Template.pdf", location="us"):
    """Parse a resume PDF stored in GCS and return its text.

    The file is read from
    ``gs://<GCS_BUCKET_NAME>/resume-assistant/data/input/dev/pdfs/<filename>``
    and the parser's output is written under
    ``gs://<GCS_BUCKET_NAME>/resume-assistant/data/output/dev/pdfs``.

    Args:
        filename: Name of the PDF inside the input folder.
        location: Location passed to ``DocAIParser``.

    Returns:
        The ``page_content`` of every parsed document, concatenated in order.

    Raises:
        ValueError: If ``GCS_BUCKET_NAME`` or ``DOC_PROCESSOR_NAME`` is unset,
            which would otherwise produce paths such as ``gs://None/...``.
    """
    if not GCS_BUCKET_NAME or not DOC_PROCESSOR_NAME:
        raise ValueError(
            "GCS_BUCKET_NAME and DOC_PROCESSOR_NAME must be set in the environment"
        )

    base_path = f"gs://{GCS_BUCKET_NAME}/resume-assistant/data"
    parser = DocAIParser(
        location=location,
        processor_name=DOC_PROCESSOR_NAME,
        gcs_output_path=f"{base_path}/output/dev/pdfs",
    )
    blob = Blob(path=f"{base_path}/input/dev/pdfs/{filename}")

    # Join once instead of growing a string with += inside a loop.
    return "".join(doc.page_content for doc in parser.lazy_parse(blob))

In [None]:
# Extract the job description for the first posting in `urls` and show it.
job1 = get_job_description(chat_template, urls[0])
print(job1)

In [None]:
# Parse the default resume PDF and show the extracted text.
resume_content = get_resume_content()
print(resume_content)

In [None]:
# Pipe the ATS prompt into the chat model and run it on the parsed resume
# together with the extracted job description.
chat_model = ChatVertexAI(model_name="gemini-1.5-pro")
ats_chain = ats_template | chat_model
ats_result = ats_chain.invoke(
    {
        "resume_content": resume_content,
        "job_description": job1,
    }
)

In [None]:
# Show the text of the model's ATS-oriented resume rewrite.
print(ats_result.content)

In [None]:
# Show the extracted job description again (same value printed right after
# it was computed) for comparison with the rewrite above.
print(job1)