In [None]:
from dotenv import load_dotenv
import os
import requests
import json
from bs4 import BeautifulSoup

load_dotenv()

from langchain_google_vertexai import ChatVertexAI
from langchain.prompts import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate
)
from langchain_core.messages import SystemMessage
from langchain_google_community import DocAIParser
from langchain_core.document_loaders.blob_loaders import Blob

In [None]:
# Deployment settings, looked up in the process environment (populated by
# load_dotenv() in the import cell). A variable that is not set yields None.
PROJECT_ID = os.environ.get("PROJECT_ID")
REGION = os.environ.get("REGION")
INDEX_NAME = os.environ.get("INDEX_NAME")
INDEX_ID = os.environ.get("INDEX_ID")
# Cloud SQL credentials.
DB_USER = os.environ.get("CLOUD_SQL_USER1")
DB_PASS = os.environ.get("CLOUD_SQL_PASSWORD")
# Cloud Storage bucket and document processor used by the resume helpers.
GCS_BUCKET_NAME = os.environ.get("GCS_BUCKET_NAME")
DOC_PROCESSOR_NAME = os.environ.get("DOC_PROCESSOR_NAME")

In [None]:
def scrape_webpage_content(url, timeout=10):
    """Download a web page and return its text content.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on an
            unresponsive host.

    Returns:
        The text of the page with fragments separated by newlines and
        surrounding whitespace stripped, or ``None`` when the server answers
        with a status code other than 200.

    Raises:
        requests.RequestException: If the request cannot be completed
            (connection failure, timeout, ...).
    """
    # A browser-like User-Agent is sent with the request.
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers, timeout=timeout)

    if response.status_code != 200:
        print(f"Failed to fetch page, status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    return soup.get_text(separator='\n', strip=True)


def get_json_from_result(result):
    """Parse the JSON payload carried by a chat model response.

    The reply may be wrapped in a Markdown code fence (```` ```json ... ``` ````)
    or be bare JSON. The fence is located with a regular expression instead of
    fixed character offsets, so extra whitespace or a missing language tag
    does not corrupt the payload.

    Args:
        result: Object exposing the reply text as ``result.content``.

    Returns:
        The decoded JSON value.

    Raises:
        json.JSONDecodeError: If the extracted text is not valid JSON.
    """
    import re  # local import: the notebook's import cell does not provide `re`

    text = result.content.strip()
    fenced = re.search(r"```(?:json)?\s*(.*?)\s*```", text, flags=re.DOTALL | re.IGNORECASE)
    # Fall back to the whole reply when no code fence is present.
    payload = fenced.group(1) if fenced else text
    return json.loads(payload)

In [None]:
# Job-posting pages available for extraction; the cells visible in this
# notebook only use the first entry (urls[0]).
urls = [
    "https://www.linkedin.com/jobs/view/4157337860",
    "https://job-boards.greenhouse.io/scaleai/jobs/4413274005",
    "https://www.linkedin.com/jobs/view/4091428817",
]

In [None]:
# Prompt used to pull the job description out of scraped page text.
# The system message asks for a JSON object with a "job_description" key,
# which get_job_description reads from the parsed reply.
# NOTE(review): the system text contains literal braces; this presumably works
# because SystemMessage content is not treated as a template — confirm.
chat_template = ChatPromptTemplate.from_messages(
    [
        SystemMessage(
            content=(
                "You are a helpful assistant that helps to extract the job description from a corpus of words from a webpage"
                """
                You analyse the content provide, and extract only the job requirements and responsibilities.
                The content contains other irrelevant text extracted from the web page. Your job is to output only 
                the job requirements and responsibilities, skills, company. Do not modify the original words in the job description.
                Return the extracted job description only as a json object.
                Example:
                {
                    "job_description": "This is a job that does nothing"
                }
                """
            )
        ),
        # `{webpage_content}` is the template variable supplied at invoke time.
        HumanMessagePromptTemplate.from_template(
            "Kindly extract the job description from this: {webpage_content}"
        ),
    ]
)

# Prompt that rewrites a resume so it reflects the keywords of a job
# description. The human message takes two template variables:
# `{resume_content}` and `{job_description}`.
# Adjacent string literals are concatenated with no separator, so each piece
# of the system message ends with an explicit newline to keep sentences from
# running into each other.
ats_template = ChatPromptTemplate.from_messages(
    [
        SystemMessage(
            content=(
                "You are a helpful assistant that helps review resumes for ATS compliance.\n"
                """You ensure that you use the provided job description to make the resume more ATS friendly \n
                Do not remove quantifiable metrics found in the resume experience, but only modify the resume to include the keywords found in the job_description
                Ensure that the essence of the work experience in the resume content is retained while modifying the sentences to semantically include keywords in the job description\n
                Provide only content that should go into the final resume and nothing extra\n
                """
                """You always respond in a structured format with the following subheadings: \n
                1. Summary \n
                2. Experience \n
                3. Skills \n
                4. Missing Keywords \n
                5. In this section only, list changes made to the resume to make it ATS friendly that was previously missing \n
                """
            )
        ),
        HumanMessagePromptTemplate.from_template(
            "Make my resume and work experience more ATS friendly: \n {resume_content} \n using the job's description: \n {job_description}"
        ),
    ]
)

In [None]:
def get_job_description(chat_template, web_url, model_name="gemini-1.5-pro"):
    """Scrape a job posting and have the chat model extract its description.

    Args:
        chat_template: Prompt template that accepts a ``webpage_content``
            variable and asks for a JSON reply with a ``job_description`` key.
        web_url: Address of the job posting.
        model_name: Vertex AI chat model used for the extraction.

    Returns:
        The value of the ``job_description`` key in the model's JSON reply.

    Raises:
        ValueError: If no content could be scraped from ``web_url``. Without
            this check the prompt would be filled with ``None`` and the model
            would be asked to extract a description from nothing.
        KeyError: If the reply has no ``job_description`` key.
    """
    webpage_content = scrape_webpage_content(web_url)
    if not webpage_content:
        raise ValueError(f"Could not retrieve any content from {web_url}")

    chain = chat_template | ChatVertexAI(model_name=model_name)
    result = chain.invoke({"webpage_content": webpage_content})

    content_json = get_json_from_result(result)
    return content_json["job_description"]


def get_resume_content(filename="Olawale_Machine_Learning_Engineer_Template.pdf"):
    """Return the text of a resume PDF stored in the project bucket.

    Args:
        filename: Name of the PDF under
            ``resume-assistant/data/input/dev/pdfs/`` in ``GCS_BUCKET_NAME``.

    Returns:
        The ``page_content`` of every document yielded by ``DocAIParser``,
        concatenated in order with no separator.
    """
    output_uri = "gs://{}/resume-assistant/data/output/dev/pdfs".format(GCS_BUCKET_NAME)
    input_uri = f"gs://{GCS_BUCKET_NAME}/resume-assistant/data/input/dev/pdfs/{filename}"

    doc_parser = DocAIParser(
        location="us",
        processor_name=DOC_PROCESSOR_NAME,
        gcs_output_path=output_uri,
    )
    parsed_docs = doc_parser.lazy_parse(Blob(path=input_uri))
    return "".join(parsed.page_content for parsed in parsed_docs)

In [None]:
# Extract the description of the first posting in `urls` with the extraction
# prompt, then show it.
job1 = get_job_description(chat_template, urls[0])
print(job1)

In [None]:
# Load the resume text for the default template PDF, then show it.
resume_content = get_resume_content()
print(resume_content)

In [None]:
# Pipe the ATS prompt into the chat model and run it once with the extracted
# job description and the parsed resume text.
chat_model = ChatVertexAI(model_name="gemini-1.5-pro")
ats_chain = ats_template | chat_model 
ats_result = ats_chain.invoke({"job_description": job1,
                       "resume_content": resume_content})

In [None]:
# Show the model's rewritten resume text.
print(ats_result.content)

In [None]:
# Print the extracted job description again.
print(job1)

In [None]:
from fpdf import FPDF

In [None]:
from google.cloud import storage

def upload_to_bucket(blob_name, path_to_file, bucket_name):
    """Upload a local file to a Cloud Storage bucket.

    Args:
        blob_name: Object name to create inside the bucket.
        path_to_file: Path of the local file to upload.
        bucket_name: Name of the destination bucket.

    Returns:
        The ``public_url`` attribute of the uploaded blob.
    """
    client = storage.Client()
    target_blob = client.get_bucket(bucket_name).blob(blob_name)
    target_blob.upload_from_filename(path_to_file)
    return target_blob.public_url

In [None]:
# Upload a file to the project bucket under the object name "filename.pdf".
# NOTE(review): `path_to_file` is an empty string, presumably a placeholder —
# a real local path has to be supplied before this cell can succeed.
upload_to_bucket(blob_name="filename.pdf",
                 path_to_file="",
                 bucket_name=GCS_BUCKET_NAME)

In [None]:
from google.cloud import storage

# NOTE(review): this cell creates a bucket with the hard-coded name
# "my-new-bucket" rather than GCS_BUCKET_NAME, and rebinds the module-level
# names `storage_client`, `bucket_name` and `bucket`. Re-running it presumably
# fails once the bucket exists — confirm whether the cell is still needed.
# Instantiates a client
storage_client = storage.Client()

# The name for the new bucket
bucket_name = "my-new-bucket"

# Creates the new bucket
bucket = storage_client.create_bucket(bucket_name)

print(f"Bucket {bucket.name} created.")

In [None]:
class PDF(FPDF):
    """FPDF document with a fixed resume header and section helpers."""

    @staticmethod
    def _latin1(text):
        """Return ``text`` with characters outside Latin-1 replaced by ``?``.

        The built-in Arial/Helvetica font used here can only render Latin-1;
        model-generated text often contains bullets, dashes or curly quotes
        that would otherwise abort PDF generation with an encoding error.
        """
        return text.encode("latin-1", "replace").decode("latin-1")

    def header(self):
        """Draw the centred "Professional Resume" title at the top of a page."""
        self.set_font("Arial", style="B", size=16)
        # Width 0 stretches the cell to the right margin so the title is
        # centred between the margins instead of overflowing the page.
        self.cell(0, 10, "Professional Resume", ln=True, align='C')
        self.ln(10)

    def section_title(self, title):
        """Write a bold, left-aligned section heading followed by a gap."""
        self.set_font("Arial", style="B", size=14)
        self.cell(0, 10, self._latin1(title), ln=True, align='L')
        self.ln(5)

    def section_body(self, text):
        """Write a wrapped paragraph of body text followed by a gap."""
        self.set_font("Arial", size=12)
        self.multi_cell(0, 10, self._latin1(text))
        self.ln(5)

def save_to_pdf(
    content,
    filename="resume.pdf",
    summary="Experienced professional with a strong background in machine learning and software engineering.",
    skills="- Python\n- Machine Learning\n- Data Science\n- Web Scraping\n- Distributed Computing",
):
    """Write a three-section resume PDF.

    Args:
        content: Text placed under the "Work Experience" heading.
        filename: Path of the PDF file to create.
        summary: Text placed under "Professional Summary". Defaults to the
            text that was previously hard-coded.
        skills: Text placed under "Skills". Defaults to the list that was
            previously hard-coded.
    """
    pdf = PDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()

    # Sections are emitted in a fixed order: summary, experience, skills.
    pdf.section_title("Professional Summary")
    pdf.section_body(summary)

    pdf.section_title("Work Experience")
    pdf.section_body(content)

    pdf.section_title("Skills")
    pdf.section_body(skills)

    pdf.output(filename)
    print(f"Content saved to {filename}")

In [None]:
# Write the model's reply into the default output file ("resume.pdf"); the
# whole reply is placed under the "Work Experience" heading.
save_to_pdf(ats_result.content)

In [None]:
import fitz  # PyMuPDF

def replace_text_in_pdf(input_pdf, output_pdf, search_text, replace_text):
    """Replace the first match of a text on each page of a PDF.

    On every page the text is searched for; when it is found, the first
    returned rectangle is redacted and ``replace_text`` is written in its
    place. The result is saved to ``output_pdf``.

    Args:
        input_pdf: Path of the PDF to read.
        output_pdf: Path the modified PDF is saved to.
        search_text: Text to look for on each page.
        replace_text: Text written over the first match (Helvetica, 11 pt).
    """
    # Opening the document as a context manager closes it even when
    # searching, redacting or saving raises.
    with fitz.open(input_pdf) as doc:
        for page in doc:
            text_instances = page.search_for(search_text)
            print("text_instances", len(text_instances), text_instances)

            if text_instances:
                # Only the first rectangle is replaced; further hits on the
                # same page are left untouched.
                page.add_redact_annot(text_instances[0], replace_text, fontname="helv", fontsize=11)
                page.apply_redactions(images=fitz.PDF_REDACT_IMAGE_NONE)

        doc.save(output_pdf)
    print(f"Modified PDF saved as {output_pdf}")


In [None]:
# Text to search for in the resume PDF and the text to put in its place.
# Both literals start and end with a newline because of how the triple-quoted
# strings are laid out.
prev_text = """
CI/CD pipelines implemented using GitHub actions to ensure seamless continuous integration
and development, resulting in ~50% reduction in production downtimes
"""
new_text = """
Developed and productionised automated ML backend service with django backend for computer
"""

In [None]:
# NOTE(review): `file_path` and `output_path` are assigned in a later cell, so
# on a fresh kernel this cell only works after that cell has been run.
# replace_text_in_pdf(file_path, output_path, "SOTA", "new_text")
replace_text_in_pdf(file_path, output_path, prev_text, new_text)

In [None]:
# Directory holding the resume PDFs. It can be overridden with the
# RESUME_PDF_DIR environment variable so the notebook is not tied to one
# machine; the default is the original location.
RESUME_PDF_DIR = os.getenv(
    "RESUME_PDF_DIR",
    "/home/olawale/Desktop/PROJECTS/llms/LLMS/resume-assistant/data/input/dev/pdfs",
)
# Source resume, plus the PDF and DOCX outputs written next to it.
file_path = f"{RESUME_PDF_DIR}/Olawale_Machine_Learning_Engineer_Template.pdf"
output_path = f"{RESUME_PDF_DIR}/Olawale_Machine_Learning_Engineer_Template_output.pdf"
output_docx = f"{RESUME_PDF_DIR}/Olawale_Machine_Learning_Engineer_Template_output.docx"

In [None]:
!pip install pdflatex

In [None]:
import pypandoc
def convert_docx_to_pdf(input_docx, output_pdf):
    """Convert a DOCX file with pypandoc and report the result.

    The target format handed to pypandoc is ``'latex'`` and the output is
    written to ``output_pdf``.
    NOTE(review): this presumably relies on pandoc producing a PDF when the
    output file name ends in ``.pdf`` — confirm.

    Args:
        input_docx: Path of the DOCX file to convert.
        output_pdf: Path of the file to write.
    """
    target_format = 'latex'
    pypandoc.convert_file(input_docx, target_format, outputfile=output_pdf)
    message = f"Converted {input_docx} to {output_pdf}"
    print(message)

# Example usage
# NOTE(review): `output_docx` is the file written by convert_pdf_to_docx in a
# later cell, so on a fresh run it presumably does not exist yet — confirm
# the intended cell order.
convert_docx_to_pdf(output_docx, output_path)

In [None]:
from pdf2docx import Converter

def convert_pdf_to_docx(input_pdf, output_docx):
    """Convert every page of a PDF into a DOCX file.

    Args:
        input_pdf: Path of the PDF to read.
        output_docx: Path of the DOCX file to write.
    """
    cv = Converter(input_pdf)
    try:
        cv.convert(output_docx, start=0, end=None)  # Convert all pages
    finally:
        # Release the converter even when the conversion raises.
        cv.close()
    print(f"Converted {input_pdf} to {output_docx}")

# Example usage: convert the source resume PDF into `output_docx`.
convert_pdf_to_docx(file_path, output_docx)

In [None]:
from pypdf import PdfReader

def extract_text_from_pdf(pdf_path):
    """Read a PDF and return the text of all its pages.

    Args:
        pdf_path: Path of the PDF file.

    Returns:
        The extracted text of each page joined with newlines, with leading
        and trailing whitespace removed.
    """
    pdf_reader = PdfReader(pdf_path)
    # One newline goes between consecutive pages; the final strip() removes
    # any whitespace left at either end.
    page_texts = (pdf_page.extract_text() + "\n" for pdf_page in pdf_reader.pages)
    return "".join(page_texts).strip()

In [None]:
# Extract the text of the source resume PDF, then show it.
pdf_text = extract_text_from_pdf(file_path)
print(pdf_text)

In [None]:
# Wrap the extracted text in a Blob; this rebinds the module-level name `blob`.
blob = Blob(data=pdf_text)

In [None]:
import base64

In [None]:
# Read the whole text file into memory.
# NOTE(review): the path is absolute and machine-specific.
with open("/home/olawale/Desktop/PROJECTS/llms/LLMS/resume-assistant/filename.txt", "r") as file:
    data_read = file.read()

In [None]:
# Display the raw file content.
data_read

In [None]:
# Keep what follows the first ";base64," marker and decode it to raw bytes.
# Indexing with [1] raises IndexError when the marker is absent.
# NOTE(review): presumably `data_read` holds a data URL — confirm.
data_encode = data_read.encode("utf8").split(b";base64,")[1]
data_decode = base64.decodebytes(data_encode)
data_decode

In [None]:
from google.cloud import storage
# NOTE(review): the upload below is commented out, so `blob` here is whatever
# an earlier cell bound to that name (most recently `Blob(data=pdf_text)`),
# not a Cloud Storage blob — confirm that `public_url` exists on that object.
# storage_client = storage.Client()
# bucket = storage_client.get_bucket(GCS_BUCKET_NAME)
# blob = bucket.blob("resume-assistant/data/output/dev/pdfs/test_bytes.pdf")
# blob.upload_from_string(pdf_text, content_type="application/pdf")
blob.public_url

In [None]:
from resume_assistant.application.dataset.upload import upload_to_gcs

# Call the project's upload helper with "trial.pdf".
# NOTE(review): the helper's signature is not visible here — confirm what the
# argument refers to.
upload_to_gcs("trial.pdf")