In [None]:
# import os
# import json
# from pdf2image import convert_from_path
# import pytesseract
# from ibm_watson_machine_learning.metanames import GenTextParamsMetaNames as GenParams
# from ibm_watson_machine_learning.foundation_models import Model
# from ibm_watson_machine_learning.foundation_models.utils.enums import (
#     DecodingMethods,
# )

In [None]:
from pdf2image import convert_from_path
import pytesseract

# Function to extract text from PDF
def parse_pdf_pytesseract(pdf_path, page_number=0) -> str:
    images = convert_from_path(
        pdf_path, first_page=page_number + 1, last_page=page_number + 1
    )
    if not images:
        return "Failed to convert PDF to image"
    page_image = images[0]
    extracted_text = pytesseract.image_to_string(page_image)
    return extracted_text

In [None]:
# Function to initialize the model
def initialize_model(model_id):
    generate_params = {
        GenParams.DECODING_METHOD: DecodingMethods.GREEDY,
        GenParams.MAX_NEW_TOKENS: 1024,
    }
    model = Model(
        model_id=model_id,
        params=generate_params,
        credentials={"apikey": os.getenv("GA_API_KEY"), "url": os.getenv("GA_URL")},
        project_id=os.getenv("GA_PROJECT_ID"),
    )
    return model

In [None]:
# Function to extract resume information
def resume_extraction(parsed_pdf_file, model_id, prompt):
    with open(parsed_pdf_file.replace(".pdf", ".txt"), "w") as file:
        file.write(parse_pdf_pytesseract(parsed_pdf_file, 0))

    with open(parsed_pdf_file.replace(".pdf", ".txt")) as file:
        text = file.read()

    parsed_pdf_file = os.path.basename(parsed_pdf_file)
    text += "\n" + "File Name: " + parsed_pdf_file + "\n"
    # print("text: ", text)
    prompt = prompt.format(filename=parsed_pdf_file, text=text)

    with open("r_prompt.txt", "w") as file:
        file.write(prompt)

    input_tokens = 0
    input_tokens += len(prompt)

    model = initialize_model(model_id)
    inferred = []
    output_tokens = 0

    resp = model.generate(prompt=prompt)["results"][0]["generated_text"]
    output_tokens += len(resp)

    try:
        first, last = resp.find("{"), resp.rfind("}")
        resp = json.loads(resp[first : last + 1], strict=False)
        inferred.append(resp)
    except json.JSONDecodeError as e:
        print("Invalid JSON:", e)

    return input_tokens, output_tokens, inferred

In [None]:
# Sample usage
pdf_path = "/Users/charan/VS_Code/EMEA/db-schenker-entity-extraction/src/entity_extraction/Charana_H_U_Resume_2024.pdf"
model_id = "meta-llama/llama-3-70b-instruct"
resume_extraction_prompt_input = """[INST] You are an information extraction assistant. Your task is to extract the following information in JSON format:
- Name: "name"
- Email: "email"
- Phone Number: "phone_number"
- Current Organization: "current_organization"
- Years of Experience: "years_experience"
- Skills: "skills"

Use this syntax for your response:
{{
    "filename": {filename},
    "name": "...",
    "email": "...",
    "phone_number": "...",
    "current_organization": "...",
    "years_experience": "...",
    "skills": "..."
}}


Input: {text}

Output:"""

# Extract information from the resume
input_tokens, output_tokens, inferred = resume_extraction(
    pdf_path, model_id, resume_extraction_prompt_input
)
print("Input Tokens:", input_tokens)
print("Output Tokens:", output_tokens)
print("Inferred Information:", inferred)

In [None]:
import os
import json
import sqlite3
from pdf2image import convert_from_path
import pytesseract
from ibm_watson_machine_learning.metanames import GenTextParamsMetaNames as GenParams
from ibm_watson_machine_learning.foundation_models import Model
from ibm_watson_machine_learning.foundation_models.utils.enums import (
    DecodingMethods,
)


# Function to extract text from PDF
def parse_pdf_pytesseract(pdf_path, page_number=0) -> str:
    images = convert_from_path(
        pdf_path, first_page=page_number + 1, last_page=page_number + 1
    )
    if not images:
        return "Failed to convert PDF to image"
    page_image = images[0]
    extracted_text = pytesseract.image_to_string(page_image)
    return extracted_text


# Function to initialize the model
def initialize_model(model_id):
    generate_params = {
        GenParams.DECODING_METHOD: DecodingMethods.GREEDY,
        GenParams.MAX_NEW_TOKENS: 1024,
    }
    model = Model(
        model_id=model_id,
        params=generate_params,
        credentials={"apikey": os.getenv("GA_API_KEY"), "url": os.getenv("GA_URL")},
        project_id=os.getenv("GA_PROJECT_ID"),
    )
    return model


# Function to extract resume information and insert into SQLite
def resume_extraction_and_insert(parsed_pdf_file, model_id, prompt, db_path):
    # Extract text from PDF
    with open(parsed_pdf_file.replace(".pdf", ".txt"), "w") as file:
        file.write(parse_pdf_pytesseract(parsed_pdf_file, 0))

    with open(parsed_pdf_file.replace(".pdf", ".txt")) as file:
        text = file.read()

    parsed_pdf_file = os.path.basename(parsed_pdf_file)
    text += "\n" + "File Name: " + parsed_pdf_file + "\n"
    prompt = prompt.format(filename=parsed_pdf_file, text=text)

    with open("r_prompt.txt", "w") as file:
        file.write(prompt)

    # Initialize Watsonx model
    model = initialize_model(model_id)

    # Generate response
    resp = model.generate(prompt=prompt)["results"][0]["generated_text"]

    try:
        # Extract JSON from response
        first, last = resp.find("{"), resp.rfind("}")
        resp_json = json.loads(resp[first : last + 1], strict=False)

        # Extract information from JSON
        name = resp_json.get("name", "")
        email = resp_json.get("email", "")
        phone_number = resp_json.get("phone_number", "")
        current_organization = resp_json.get("current_organization", "")
        years_experience = resp_json.get("years_experience", "")
        skills = resp_json.get("skills", "")

        # Insert into SQLite database
        conn = sqlite3.connect(db_path)
        c = conn.cursor()

        # Create table if not exists
        c.execute(
            """CREATE TABLE IF NOT EXISTS resumes
                     (name TEXT, email TEXT, phone_number TEXT,
                      current_organization TEXT, years_experience TEXT, skills TEXT)"""
        )

        # Insert extracted data into the table
        c.execute(
            "INSERT INTO resumes VALUES (?, ?, ?, ?, ?, ?)",
            (name, email, phone_number, current_organization, years_experience, skills),
        )

        conn.commit()
        conn.close()

        inferred = {
            "name": name,
            "email": email,
            "phone_number": phone_number,
            "current_organization": current_organization,
            "years_experience": years_experience,
            "skills": skills,
        }

        return inferred

    except json.JSONDecodeError as e:
        print("Invalid JSON:", e)
        return 0, 0, {}


# Sample usage
pdf_path = "/Users/charan/VS_Code/EMEA/db-schenker-entity-extraction/src/entity_extraction/s-2.pdf"
model_id = "meta-llama/llama-3-70b-instruct"
resume_extraction_prompt_input = """[INST] You are an information extraction assistant. Your task is to extract the following information in JSON format:
- Name: "name"
- Email: "email"
- Phone Number: "phone_number"
- Current Organization: "current_organization"
- Years of Experience: "years_experience"
- Skills: "skills"

Use this syntax for your response:
{{
    "filename": {filename},
    "name": "...",
    "email": "...",
    "phone_number": "...",
    "current_organization": "...",
    "years_experience": "...",
    "skills": "..."
}}


Input: {text}

Output:"""

db_path = "resume_data.db"

# Extract information from the resume and insert into SQLite
inferred = resume_extraction_and_insert(
    pdf_path, model_id, resume_extraction_prompt_input, db_path
)
print("Inferred Information:", inferred)

In [None]:
import os
import json
import sqlite3
from pdf2image import convert_from_path
import pytesseract
from ibm_watson_machine_learning.metanames import GenTextParamsMetaNames as GenParams
from ibm_watson_machine_learning.foundation_models import Model
from ibm_watson_machine_learning.foundation_models.utils.enums import (
    DecodingMethods,
)


# Function to extract text from PDF
def parse_pdf_pytesseract(pdf_path, page_number=0) -> str:
    images = convert_from_path(
        pdf_path, first_page=page_number + 1, last_page=page_number + 1
    )
    if not images:
        return "Failed to convert PDF to image"
    page_image = images[0]
    extracted_text = pytesseract.image_to_string(page_image)
    return extracted_text


# Function to initialize the model
def initialize_model(model_id):
    generate_params = {
        GenParams.DECODING_METHOD: DecodingMethods.GREEDY,
        GenParams.MAX_NEW_TOKENS: 1024,
    }
    model = Model(
        model_id=model_id,
        params=generate_params,
        credentials={"apikey": os.getenv("GA_API_KEY"), "url": os.getenv("GA_URL")},
        project_id=os.getenv("GA_PROJECT_ID"),
    )
    return model


# Function to check if email exists in SQLite database
def check_email_exists(conn, email):
    c = conn.cursor()
    c.execute("SELECT COUNT(*) FROM resumes WHERE email=?", (email,))
    result = c.fetchone()
    return result[0] > 0


# Function to insert or update resume information in SQLite
def resume_extraction_and_insert(parsed_pdf_file, model_id, prompt, db_path):
    # Extract text from PDF
    with open(parsed_pdf_file.replace(".pdf", ".txt"), "w") as file:
        file.write(parse_pdf_pytesseract(parsed_pdf_file, 0))

    with open(parsed_pdf_file.replace(".pdf", ".txt")) as file:
        text = file.read()

    parsed_pdf_file = os.path.basename(parsed_pdf_file)
    text += "\n" + "File Name: " + parsed_pdf_file + "\n"
    prompt = prompt.format(filename=parsed_pdf_file, text=text)

    with open("r_prompt.txt", "w") as file:
        file.write(prompt)

    # Initialize Watsonx model
    model = initialize_model(model_id)

    # Generate response
    resp = model.generate(prompt=prompt)["results"][0]["generated_text"]

    try:
        # Extract JSON from response
        first, last = resp.find("{"), resp.rfind("}")
        resp_json = json.loads(resp[first : last + 1], strict=False)

        # Extract information from JSON
        name = resp_json.get("name", "")
        email = resp_json.get("email", "")
        phone_number = resp_json.get("phone_number", "")
        current_organization = resp_json.get("current_organization", "")
        years_experience = resp_json.get("years_experience", "")
        skills = resp_json.get("skills", "")

        # Connect to SQLite database
        conn = sqlite3.connect(db_path)

        # Check if email already exists
        if check_email_exists(conn, email):
            # Perform update if email exists
            c = conn.cursor()
            c.execute(
                """UPDATE resumes 
                         SET name=?, phone_number=?, current_organization=?, 
                             years_experience=?, skills=?
                         WHERE email=?""",
                (
                    name,
                    phone_number,
                    current_organization,
                    years_experience,
                    skills,
                    email,
                ),
            )
        else:
            # Perform insert if email does not exist
            c = conn.cursor()
            c.execute(
                """INSERT INTO resumes 
                         (name, email, phone_number, current_organization, years_experience, skills) 
                         VALUES (?, ?, ?, ?, ?, ?)""",
                (
                    name,
                    email,
                    phone_number,
                    current_organization,
                    years_experience,
                    skills,
                ),
            )

        conn.commit()
        conn.close()

        inferred = {
            "name": name,
            "email": email,
            "phone_number": phone_number,
            "current_organization": current_organization,
            "years_experience": years_experience,
            "skills": skills,
        }

        return inferred

    except json.JSONDecodeError as e:
        print("Invalid JSON:", e)
        return {}


# Sample usage
pdf_path = "/Users/charan/VS_Code/EMEA/db-schenker-entity-extraction/src/entity_extraction/s-2.pdf"
model_id = "meta-llama/llama-3-70b-instruct"
resume_extraction_prompt_input = """[INST] You are an information extraction assistant. Your task is to extract the following information in JSON format:
- Name: "name"
- Email: "email"
- Phone Number: "phone_number"
- Current Organization: "current_organization"
- Years of Experience: "years_experience"
- Skills: "skills"

Use this syntax for your response:
{{
    "filename": {filename},
    "name": "...",
    "email": "...",
    "phone_number": "...",
    "current_organization": "...",
    "years_experience": "...",
    "skills": "..."
}}


Input: {text}

Output:"""

db_path = "resume_data.db"

# Extract information from the resume and insert or update into SQLite
inferred = resume_extraction_and_insert(
    pdf_path, model_id, resume_extraction_prompt_input, db_path
)
print("Inferred Information:", inferred)

In [None]:
import os
import json
import sqlite3
from pdf2image import convert_from_path
import pytesseract
from ibm_watson_machine_learning.metanames import GenTextParamsMetaNames as GenParams
from ibm_watson_machine_learning.foundation_models import Model
from ibm_watson_machine_learning.foundation_models.utils.enums import (
    DecodingMethods,
)


# Function to extract text from PDF
def parse_pdf_pytesseract(pdf_path, page_number=0) -> str:
    images = convert_from_path(
        pdf_path, first_page=page_number + 1, last_page=page_number + 1
    )
    if not images:
        return "Failed to convert PDF to image"
    page_image = images[0]
    extracted_text = pytesseract.image_to_string(page_image)
    return extracted_text


# Function to initialize the model
def initialize_model(model_id):
    generate_params = {
        GenParams.DECODING_METHOD: DecodingMethods.GREEDY,
        GenParams.MAX_NEW_TOKENS: 1024,
    }
    model = Model(
        model_id=model_id,
        params=generate_params,
        credentials={"apikey": os.getenv("GA_API_KEY"), "url": os.getenv("GA_URL")},
        project_id=os.getenv("GA_PROJECT_ID"),
    )
    return model


# Function to check if email exists in SQLite database
def check_email_exists(conn, email):
    c = conn.cursor()
    c.execute("SELECT COUNT(*) FROM resumes WHERE email=?", (email,))
    result = c.fetchone()
    return result[0] > 0


# Function to insert or update resume information in SQLite
def resume_extraction_and_insert(parsed_pdf_file, model_id, prompt, db_path):
    # Extract text from PDF
    with open(parsed_pdf_file.replace(".pdf", ".txt"), "w") as file:
        file.write(parse_pdf_pytesseract(parsed_pdf_file, 0))

    with open(parsed_pdf_file.replace(".pdf", ".txt")) as file:
        text = file.read()

    parsed_pdf_file = os.path.basename(parsed_pdf_file)
    text += "\n" + "File Name: " + parsed_pdf_file + "\n"
    prompt = prompt.format(filename=parsed_pdf_file, text=text)

    with open("r_prompt.txt", "w") as file:
        file.write(prompt)

    # Initialize Watsonx model
    model = initialize_model(model_id)

    # Generate response
    resp = model.generate(prompt=prompt)["results"][0]["generated_text"]

    try:
        # Extract JSON from response
        first, last = resp.find("{"), resp.rfind("}")
        resp_json = json.loads(resp[first : last + 1], strict=False)

        # Extract information from JSON
        name = resp_json.get("name", "")
        email = resp_json.get("email", "")
        phone_number = resp_json.get("phone_number", "")
        current_organization = resp_json.get("current_organization", "")
        years_experience = resp_json.get("years_experience", "")
        skills = resp_json.get("skills", "")

        # Connect to SQLite database
        conn = sqlite3.connect(db_path)

        # Check if email already exists
        if check_email_exists(conn, email):
            # Perform update if email exists
            c = conn.cursor()
            c.execute(
                """UPDATE resumes 
                         SET name=?, phone_number=?, current_organization=?, 
                             years_experience=?, skills=?
                         WHERE email=?""",
                (
                    name,
                    phone_number,
                    current_organization,
                    years_experience,
                    skills,
                    email,
                ),
            )
        else:
            # Perform insert if email does not exist
            c = conn.cursor()
            c.execute(
                """INSERT INTO resumes 
                         (name, email, phone_number, current_organization, years_experience, skills) 
                         VALUES (?, ?, ?, ?, ?, ?)""",
                (
                    name,
                    email,
                    phone_number,
                    current_organization,
                    years_experience,
                    skills,
                ),
            )

        conn.commit()
        conn.close()

        inferred = {
            "name": name,
            "email": email,
            "phone_number": phone_number,
            "current_organization": current_organization,
            "years_experience": years_experience,
            "skills": skills,
        }

        return inferred

    except json.JSONDecodeError as e:
        print("Invalid JSON:", e)
        return {}


# Function to process multiple PDFs
def process_multiple_pdfs(pdf_paths, model_id, prompt, db_path):
    results = []
    for pdf_path in pdf_paths:
        inferred = resume_extraction_and_insert(pdf_path, model_id, prompt, db_path)
        results.append(inferred)
    return results


# Sample usage
pdf_paths = [
    "/Users/charan/VS_Code/EMEA/db-schenker-entity-extraction/src/entity_extraction/Charana_H_U_Resume_2024.pdf",
    "/Users/charan/VS_Code/EMEA/db-schenker-entity-extraction/src/entity_extraction/s-2.pdf",
]
model_id = "meta-llama/llama-3-70b-instruct"
resume_extraction_prompt_input = """[INST] You are an information extraction assistant. Your task is to extract the following information in JSON format:
- Name: "name"
- Email: "email"
- Phone Number: "phone_number"
- Current Organization: "current_organization"
- Years of Experience: "years_experience"
- Skills: "skills"

Use this syntax for your response:
{{
    "filename": {filename},
    "name": "...",
    "email": "...",
    "phone_number": "...",
    "current_organization": "...",
    "years_experience": "...",
    "skills": "..."
}}


Input: {text}

Output:"""

db_path = "resume_data.db"

# Process multiple PDFs and insert or update into SQLite
results = process_multiple_pdfs(
    pdf_paths, model_id, resume_extraction_prompt_input, db_path
)
print("Inferred Information for all PDFs:", results)

In [8]:
import os
import json
import sqlite3
from pdf2image import convert_from_path
import pytesseract
from ibm_watson_machine_learning.metanames import GenTextParamsMetaNames as GenParams
from ibm_watson_machine_learning.foundation_models import Model
from ibm_watson_machine_learning.foundation_models.utils.enums import (
    DecodingMethods,
)


# Function to extract text from PDF
def parse_pdf_pytesseract(pdf_path, page_number=0) -> str:
    images = convert_from_path(
        pdf_path, first_page=page_number + 1, last_page=page_number + 1
    )
    if not images:
        return "Failed to convert PDF to image"
    page_image = images[0]
    extracted_text = pytesseract.image_to_string(page_image)
    return extracted_text


# Function to initialize the model
def initialize_model(model_id):
    generate_params = {
        GenParams.DECODING_METHOD: DecodingMethods.GREEDY,
        GenParams.MAX_NEW_TOKENS: 1024,
    }
    model = Model(
        model_id=model_id,
        params=generate_params,
        credentials={"apikey": os.getenv("GA_API_KEY"), "url": os.getenv("GA_URL")},
        project_id=os.getenv("GA_PROJECT_ID"),
    )
    return model


# Function to check if email exists in SQLite database
def check_email_exists(conn, email):
    c = conn.cursor()
    c.execute("SELECT COUNT(*) FROM resumes WHERE email=?", (email,))
    result = c.fetchone()
    return result[0] > 0


# Function to create resumes table if not exists
def create_resumes_table(conn):
    c = conn.cursor()
    c.execute(
        """CREATE TABLE IF NOT EXISTS resumes
                 (name TEXT, email TEXT PRIMARY KEY, phone_number TEXT,
                  current_organization TEXT, years_experience TEXT, skills TEXT)"""
    )
    conn.commit()


# Function to insert or update resume information in SQLite
def resume_extraction_and_insert(parsed_pdf_file, model_id, prompt, db_path):
    # Extract text from PDF
    with open(parsed_pdf_file.replace(".pdf", ".txt"), "w") as file:
        file.write(parse_pdf_pytesseract(parsed_pdf_file, 0))

    with open(parsed_pdf_file.replace(".pdf", ".txt")) as file:
        text = file.read()

    parsed_pdf_file = os.path.basename(parsed_pdf_file)
    text += "\n" + "File Name: " + parsed_pdf_file + "\n"
    prompt = prompt.format(filename=parsed_pdf_file, text=text)

    with open("r_prompt.txt", "w") as file:
        file.write(prompt)

    # Initialize Watsonx model
    model = initialize_model(model_id)

    # Generate response
    resp = model.generate(prompt=prompt)["results"][0]["generated_text"]

    try:
        # Extract JSON from response
        first, last = resp.find("{"), resp.rfind("}")
        resp_json = json.loads(resp[first : last + 1], strict=False)

        # Extract information from JSON
        name = resp_json.get("name", "")
        email = resp_json.get("email", "")
        phone_number = resp_json.get("phone_number", "")
        current_organization = resp_json.get("current_organization", "")
        years_experience = resp_json.get("years_experience", "")
        skills = resp_json.get("skills", "")

        # Connect to SQLite database
        conn = sqlite3.connect(db_path)

        # Ensure resumes table exists
        create_resumes_table(conn)

        # Check if email already exists
        if check_email_exists(conn, email):
            # Perform update if email exists
            c = conn.cursor()
            c.execute(
                """UPDATE resumes 
                         SET name=?, phone_number=?, current_organization=?, 
                             years_experience=?, skills=?
                         WHERE email=?""",
                (
                    name,
                    phone_number,
                    current_organization,
                    years_experience,
                    skills,
                    email,
                ),
            )
        else:
            # Perform insert if email does not exist
            c = conn.cursor()
            c.execute(
                """INSERT INTO resumes 
                         (name, email, phone_number, current_organization, years_experience, skills) 
                         VALUES (?, ?, ?, ?, ?, ?)""",
                (
                    name,
                    email,
                    phone_number,
                    current_organization,
                    years_experience,
                    skills,
                ),
            )

        conn.commit()
        conn.close()

        inferred = {
            "name": name,
            "email": email,
            "phone_number": phone_number,
            "current_organization": current_organization,
            "years_experience": years_experience,
            "skills": skills,
        }

        return inferred

    except json.JSONDecodeError as e:
        print("Invalid JSON:", e)
        return {}


# Function to process multiple PDFs
def process_multiple_pdfs(pdf_paths, model_id, prompt, db_path):
    results = []
    for pdf_path in pdf_paths:
        inferred = resume_extraction_and_insert(pdf_path, model_id, prompt, db_path)
        results.append(inferred)
    return results


# Sample usage
pdf_paths = [
    "/Users/charan/VS_Code/EMEA/db-schenker-entity-extraction/src/entity_extraction/Charana_H_U_Resume_2024.pdf",
    "/Users/charan/VS_Code/EMEA/db-schenker-entity-extraction/src/entity_extraction/s-2.pdf",
]
model_id = "meta-llama/llama-3-70b-instruct"
resume_extraction_prompt_input = """You are an information extraction assistant. Your task is to extract the following information in JSON format:
- Name: "name"
- Email: "email"
- Phone Number: "phone_number"
- Current Organization: "current_organization"
- Years of Experience: "years_experience"
- Skills: "skills"

Use this syntax for your response:
{{
    "filename": {filename},
    "name": "...",
    "email": "...",
    "phone_number": "...",
    "current_organization": "...",
    "years_experience": "...",
    "skills": "..."
}}


Input: {text}

Output:"""

db_path = "resume_data.db"

# Process multiple PDFs and insert or update into SQLite
results = process_multiple_pdfs(
    pdf_paths, model_id, resume_extraction_prompt_input, db_path
)
print("Inferred Information for all PDFs:", results)

Inferred Information for all PDFs: [{'name': 'Charana H U', 'email': 'charanhumail@gmail.com', 'phone_number': '+91 9481368353', 'current_organization': 'IBM (India Pvt. Ltd,)', 'years_experience': '2.5+ years', 'skills': 'Python, Langchain, Llama-Index, Django, Flask, ChromaDB, Milvus, Weaviate, MySQL, SQL Lite, Microsoft SQL Server, PyCharm, Visual Studio, VS Code, AWS Bedrock, AWS Sagemaker, Azure ML, Google Vertex Al, Watsonx.ai, NumPy, Pandas, Matplotlib, SciPy, Skit-learn, OpenCv, Tensor Flow, Keras, Pytorch, Prompt Engineering, LLM-Finetuning, Embedding Model Fine-tuning, OpenAl Whisper Speech to Text, Attention, Transformers, BERT, MLP, CNN, RNN, LSTM, Encoders and Decoders, Seq2Seq, GANs, YOLO, Docker, Git/GitHub, Elevator pitch, Stand and deliver, Public Speaking, Leadership, Teamwork and Presentation'}, {'name': 'Sohan M', 'email': 'sohanm10@gmail.com', 'phone_number': '8050636614', 'current_organization': 'IBM Bengaluru, India', 'years_experience': 'Not mentioned', 'skills'

In [9]:
import os
import json
import sqlite3
from pdf2image import convert_from_path
import pytesseract
from ibm_watson_machine_learning.metanames import GenTextParamsMetaNames as GenParams
from ibm_watson_machine_learning.foundation_models import Model
from ibm_watson_machine_learning.foundation_models.utils.enums import (
    DecodingMethods,
)
from concurrent.futures import ThreadPoolExecutor, as_completed


# Function to extract text from PDF
def parse_pdf_pytesseract(pdf_path, page_number=0) -> str:
    images = convert_from_path(
        pdf_path, first_page=page_number + 1, last_page=page_number + 1
    )
    if not images:
        return "Failed to convert PDF to image"
    page_image = images[0]
    extracted_text = pytesseract.image_to_string(page_image)
    return extracted_text


# Function to initialize the model
def initialize_model(model_id):
    generate_params = {
        GenParams.DECODING_METHOD: DecodingMethods.GREEDY,
        GenParams.MAX_NEW_TOKENS: 1024,
    }
    model = Model(
        model_id=model_id,
        params=generate_params,
        credentials={"apikey": os.getenv("GA_API_KEY"), "url": os.getenv("GA_URL")},
        project_id=os.getenv("GA_PROJECT_ID"),
    )
    return model


# Function to check if email exists in SQLite database
def check_email_exists(conn, email):
    c = conn.cursor()
    c.execute("SELECT COUNT(*) FROM resumes WHERE email=?", (email,))
    result = c.fetchone()
    return result[0] > 0


# Function to create resumes table if not exists
def create_resumes_table(conn):
    c = conn.cursor()
    c.execute(
        """CREATE TABLE IF NOT EXISTS resumes
                 (name TEXT, email TEXT PRIMARY KEY, phone_number TEXT,
                  current_organization TEXT, years_experience TEXT, skills TEXT)"""
    )
    conn.commit()


# Function to insert or update resume information in SQLite
def resume_extraction_and_insert(parsed_pdf_file, model_id, prompt, db_path):
    # Extract text from PDF
    with open(parsed_pdf_file.replace(".pdf", ".txt"), "w") as file:
        file.write(parse_pdf_pytesseract(parsed_pdf_file, 0))

    with open(parsed_pdf_file.replace(".pdf", ".txt")) as file:
        text = file.read()

    parsed_pdf_file = os.path.basename(parsed_pdf_file)
    text += "\n" + "File Name: " + parsed_pdf_file + "\n"
    prompt = prompt.format(filename=parsed_pdf_file, text=text)

    with open("r_prompt.txt", "w") as file:
        file.write(prompt)

    # Initialize Watsonx model
    model = initialize_model(model_id)

    # Generate response
    resp = model.generate(prompt=prompt)["results"][0]["generated_text"]

    try:
        # Extract JSON from response
        first, last = resp.find("{"), resp.rfind("}")
        resp_json = json.loads(resp[first : last + 1], strict=False)

        # Extract information from JSON
        name = resp_json.get("name", "")
        email = resp_json.get("email", "")
        phone_number = resp_json.get("phone_number", "")
        current_organization = resp_json.get("current_organization", "")
        years_experience = resp_json.get("years_experience", "")
        skills = resp_json.get("skills", "")

        # Connect to SQLite database
        conn = sqlite3.connect(db_path)

        # Ensure resumes table exists
        create_resumes_table(conn)

        # Check if email already exists
        if check_email_exists(conn, email):
            # Perform update if email exists
            c = conn.cursor()
            c.execute(
                """UPDATE resumes 
                         SET name=?, phone_number=?, current_organization=?, 
                             years_experience=?, skills=?
                         WHERE email=?""",
                (
                    name,
                    phone_number,
                    current_organization,
                    years_experience,
                    skills,
                    email,
                ),
            )
        else:
            # Perform insert if email does not exist
            c = conn.cursor()
            c.execute(
                """INSERT INTO resumes 
                         (name, email, phone_number, current_organization, years_experience, skills) 
                         VALUES (?, ?, ?, ?, ?, ?)""",
                (
                    name,
                    email,
                    phone_number,
                    current_organization,
                    years_experience,
                    skills,
                ),
            )

        conn.commit()
        conn.close()

        inferred = {
            "name": name,
            "email": email,
            "phone_number": phone_number,
            "current_organization": current_organization,
            "years_experience": years_experience,
            "skills": skills,
        }

        return inferred

    except json.JSONDecodeError as e:
        print("Invalid JSON:", e)
        return {}


# Function to process multiple PDFs in parallel
def process_multiple_pdfs(pdf_paths, model_id, prompt, db_path):
    results = []
    with ThreadPoolExecutor(max_workers=None) as executor:
        futures = [
            executor.submit(
                resume_extraction_and_insert, pdf_path, model_id, prompt, db_path
            )
            for pdf_path in pdf_paths
        ]
        for future in as_completed(futures):
            results.append(future.result())
    return results


# Sample usage
pdf_paths = [
    "/Users/charan/VS_Code/EMEA/db-schenker-entity-extraction/src/entity_extraction/Charana_H_U_Resume_2024.pdf",
    "/Users/charan/VS_Code/EMEA/db-schenker-entity-extraction/src/entity_extraction/s-2.pdf",
]
model_id = "meta-llama/llama-3-70b-instruct"
resume_extraction_prompt_input = """[INST] You are an information extraction assistant. Your task is to extract the following information in JSON format:
- Name: "name"
- Email: "email"
- Phone Number: "phone_number"
- Current Organization: "current_organization"
- Years of Experience: "years_experience"
- Skills: "skills"

Use this syntax for your response:
{{
    "filename": {filename},
    "name": "...",
    "email": "...",
    "phone_number": "...",
    "current_organization": "...",
    "years_experience": "...",
    "skills": "..."
}}


Input: {text}

Output:"""

db_path = "resume_data.db"

# Process multiple PDFs in parallel and insert or update into SQLite
results = process_multiple_pdfs(
    pdf_paths, model_id, resume_extraction_prompt_input, db_path
)
print("Inferred Information for all PDFs:", results)

Inferred Information for all PDFs: [{'name': 'Sohan M', 'email': 'sohanm10@gmail.com', 'phone_number': '8050636614', 'current_organization': 'IBM Bengaluru, India', 'years_experience': 'Not mentioned', 'skills': 'Python, NumPy, Pandas, Matplotlib, Skit-learn, ChromaDB, Milvus, Weaviate, MySQL, LangChain, Llamalndex, TensorFlow, PyTorch, Streamlight, Flask, OpenAl, Google Gemini Pro, Naive Bayes, Logistic Regression, Linear Regression, SVMs, Decision Tree, Analytical thinking, Problem-solving, Teamwork, Effective communication'}, {'name': 'Charana H U', 'email': 'charanhumail@gmail.com', 'phone_number': '+91 9481368353', 'current_organization': 'IBM (India Pvt. Ltd,)', 'years_experience': '2.5+ years', 'skills': 'Python, Langchain, Llama-Index, Django, Flask, ChromaDB, Milvus, Weaviate, MySQL, SQL Lite, Microsoft SQL Server, PyCharm, Visual Studio, VS Code, AWS Bedrock, AWS Sagemaker, Azure ML, Google Vertex Al, Watsonx.ai, NumPy, Pandas, Matplotlib, SciPy, Skit-learn, OpenCv, Tensor Fl