In [2]:
import sqlite3
import pandas as pd

In [4]:
db_name = "../pinecone/data/db.db"
table_name = "job_data"

# SQL identifiers cannot be bound as query parameters, so the table name is
# interpolated; it comes from the constant above, not from user input.
query = f"""
    SELECT jobLink, jobTitle, jobCompany, minSalary, maxSalary, jobDetails, jobLocation, pullDate
    FROM {table_name}
"""

conn = sqlite3.connect(db_name)
try:
    # Dump the SQL query into a DataFrame
    df = pd.read_sql_query(query, conn)
finally:
    # Close the connection even when the query fails (e.g. missing table),
    # so the database file handle is never leaked.
    conn.close()

In [108]:
# Pick one posting by position, keep its raw description, and display it.
job_num = 10
cur_details = df['jobDetails'].iloc[job_num]
cur_details

"Title: Data Engineer with Pyspark (Onsite)Cognizant (NASDAQ: CTSH) is a leading provider of information technology, consulting, and business process outsourcing services, dedicated to helping the world's leading companies build stronger businesses. Headquartered in Teaneck, New Jersey (U.S.). Cognizant is a member of the NASDAQ-100, the S&P 500, the Forbes Global 1000, and the Fortune 500 and we are among the top performing and fastest growing companies in the world.Practice - AIA - Artificial Intelligence and AnalyticsAbout AI & Analytics: Artificial intelligence (AI) and the data it collects and analyzes will soon sit at the core of all intelligent, human-centric businesses. By decoding customer needs, preferences, and behaviors, our clients can understand exactly what services, products, and experiences their consumers need. Within AI & Analytics, we work to design the future - a future in which trial-and-error business decisions have been replaced by informed choices and data-supp

In [109]:
import re

def preprocess_text(text):
    """
    Normalizes whitespace in a text blob.

    Parameters:
    text (str): The raw text.

    Returns:
    str: The text with every run of whitespace collapsed to a single space
         and with leading/trailing whitespace removed.
    """
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()

def extract_job_type(text):
    """
    Finds the first employment-type keyword mentioned in a job posting.

    Parameters:
    text (str): The job posting text.

    Returns:
    str: The matched keyword as it appears in the text, passed through
         str.capitalize() (e.g. 'Full-time'), or 'Not specified' when no
         keyword is found or the text is empty/None.
    """
    if not text:
        return "Not specified"

    # "(?<!con)" keeps the word "contemporary" from being reported as a
    # "temporary" job. Full word boundaries are deliberately not used because
    # scraped postings often glue fields together (e.g. "Full-timePay: ...").
    job_types = ["full[- ]?time", "part[- ]?time", "contract", "(?<!con)temporary", "internship", "freelance"]
    pattern = re.compile("|".join(job_types), re.IGNORECASE)
    match = pattern.search(text)
    return match.group(0).capitalize() if match else "Not specified"

def is_remote(text):
    """
    Reports whether a job posting mentions a remote-work keyword.

    The check is a case-insensitive substring search, so a keyword embedded
    in a longer word (e.g. 'virtual' inside 'virtualization') also counts.

    Parameters:
    text (str): The job posting text.

    Returns:
    bool: True if any keyword occurs in the text, otherwise False.
    """
    keyword_alternation = "remote|work from home|telecommute|virtual"
    return re.search(keyword_alternation, text, re.IGNORECASE) is not None

# Unit spellings accepted after a number: year, years, yr, yrs.
_YEARS_UNIT = r"(?:years?|yrs?)"

# "(?<!\d)" stops a pattern from starting in the middle of a longer number
# (e.g. the "23" of "2023"). Whitespace is matched with \s* because scraped
# postings frequently lose the spaces between words.
_YEARS_PATTERNS = tuple(
    re.compile(source, re.IGNORECASE)
    for source in (
        # '5 years of experience', '3+ yrs experience', '1 year of relevant experience'
        r"(?<!\d)(\d{1,2})\s*\+?\s*" + _YEARS_UNIT
        + r"[.'\u2019]?\s*(?:of\s*(?:[\w/&+-]+\s+){0,2}?)?experience",
        # '3-5 years', '3 - 5 yrs', '3 to 5 years', en/em dash ranges
        r"(?<!\d)(\d{1,2})\s*(?:[-\u2013\u2014]|to)\s*(\d{1,2})\s*\+?\s*" + _YEARS_UNIT,
        # 'at least 2 years', 'minimum of 4 years'
        r"(?:at\s+least|minimum(?:\s+of)?)\s*(\d{1,2})\s*\+?\s*" + _YEARS_UNIT,
    )
)


def extract_max_years_experience(job_details):
    """
    Extracts the maximum years of experience from a job description.

    Every number captured by any of the supported phrasings is collected
    (both ends of a range count) and the largest one is returned.

    Parameters:
    job_details (str): The job details text blob.

    Returns:
    int or str: Maximum years of experience as an int, or the string
                'Not specified' if nothing is found or the input is
                empty / not a string.
    """
    if not isinstance(job_details, str) or not job_details:
        return 'Not specified'

    # Every capture group in the patterns is mandatory, so groups() never
    # yields None and each value can be converted directly.
    years = [
        int(number)
        for pattern in _YEARS_PATTERNS
        for found in pattern.finditer(job_details)
        for number in found.groups()
    ]

    # Return the max years found or 'Not specified' if the list is empty
    return max(years) if years else 'Not specified'

clean_details = preprocess_text(cur_details)
job_type = extract_job_type(clean_details)
# Keep the result under its own name: assigning it to `is_remote` would replace
# the function with a bool, and any later call to is_remote(...) would raise
# "TypeError: 'bool' object is not callable".
job_is_remote = is_remote(clean_details)
yrs_exp = extract_max_years_experience(clean_details)

print("Job Type:", job_type)
print("Is Remote:", job_is_remote)
print("Yrs Exp:", yrs_exp)

Job Type: Not specified
Is Remote: False
Yrs Exp: Not specified


In [142]:
from langchain_ollama import OllamaLLM
import json

def create_prompt(job_details):
    """
    Builds the LLM prompt that asks for a job posting to be split into
    structured JSON fields.

    Parameters:
    job_details (str): The job posting text to embed in the prompt.

    Returns:
    str: The prompt, with the posting inserted and the expected JSON layout
         (job_description, requirements, company_description) spelled out.
    """
    return f"""
    You are a helpful assistant that extracts structured information from job postings. Below is a job posting from Indeed:

    {job_details}

    Extract the following information in JSON format:
    - "job_description": The full text of the job description, including all responsibilities and skills needed as stated.
    - "requirements": A list of job requirements exactly as listed in the posting.
    - "company_description": The full text of the company description (if available).

    Maintain the wording and details as much as possible as they are presented. You can summarize lightly.

    Return the output in the following JSON format:
    {{
        "job_description": "...",
        "requirements": ["...", "..."],
        "company_description": "..."
    }}

    Do not return anything else but the data in JSON format.
    """

def ask_ollama(prompt, model_name="llama3.2"):
    """
    Sends a prompt to an Ollama-served model and returns its reply.

    Parameters:
    prompt (str): The prompt text to send.
    model_name (str): Name of the Ollama model to use. Defaults to
                      "llama3.2", the model this notebook was written against.

    Returns:
    The value returned by OllamaLLM.invoke for the prompt (presumably the
    generated text as a str -- confirm against the langchain_ollama docs).
    """
    model = OllamaLLM(model=model_name)
    return model.invoke(prompt)

def parse_llm_response(response):
    """
    Parses an LLM reply into Python data, tolerating common wrapping.

    The reply is first parsed as-is. If that fails, the contents of a
    Markdown code fence (``` or ```json) are tried, then the span from the
    first '{' to the last '}', which covers replies that add prose around
    the JSON object.

    Parameters:
    response (str): The raw reply text from the model.

    Returns:
    The decoded JSON value (a dict for the prompt used in this notebook), or
    None when no candidate parses or the input is not text.
    """
    try:
        return json.loads(response)
    except json.JSONDecodeError:
        pass  # fall through to the more lenient candidates below
    except TypeError:
        # e.g. None: nothing to parse
        return None

    if not isinstance(response, str):
        return None

    text = response.strip()
    candidates = []

    fenced = re.search(r"```(?:json)?\s*(.*?)\s*```", text, re.DOTALL | re.IGNORECASE)
    if fenced:
        candidates.append(fenced.group(1))

    start, end = text.find("{"), text.rfind("}")
    if 0 <= start < end:
        candidates.append(text[start:end + 1])

    for candidate in candidates:
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            continue
    return None

def extract_job_details(job_details):
    """
    Runs the LLM extraction pipeline on one job posting.

    The text is whitespace-normalized, wrapped in the extraction prompt,
    sent to the model, and the reply is parsed as JSON.

    Parameters:
    job_details (str): The job details text blob.

    Returns:
    Whatever parse_llm_response returns for the model's reply: the decoded
    JSON data, or None when the reply could not be parsed.
    """
    cleaned = preprocess_text(job_details)
    raw_reply = ask_ollama(create_prompt(cleaned))
    return parse_llm_response(raw_reply)

# Run the LLM extraction on the sample posting. `clean_details` is already
# whitespace-normalized; extract_job_details normalizes it again, which is harmless.
# The result is None when the model's reply cannot be parsed as JSON.
extracted_sections = extract_job_details(clean_details)

In [144]:
# extract_job_details() yields None when the LLM reply cannot be parsed as JSON.
# Guard against that so this cell reports the problem instead of failing with
# "AttributeError: 'NoneType' object has no attribute 'get'".
if isinstance(extracted_sections, dict):
    sections = extracted_sections
else:
    print("No structured data was extracted from the LLM response.")
    sections = {}

print("Job Description:", sections.get("job_description", ""))
print("*" * 20)
print("requirements:", sections.get("requirements", ""))
print("*" * 20)
print("Company Description:", sections.get("company_description", ""))

Job Description: Looking for Data Engineer with Pyspark and AWS skillsResponsibilities1) Requirement understanding and gathering2) Design and develop data pipelines for new module3) Unit testing and help UAT team for any issues4) Upload all the deliverables in Git and help prod team to deploy the code in prodDesign and implement robust data transformation pipelines using pysparkDevelop and maintain a scalable data pipeline architecture that supports a wide variety of data sources and business use cases.Collaborate with data engineers and business stakeholders to define requirements for data transformations and models.Lead best practices for pipeline development including modularity testing version control and continuous integration (CI/CD).Optimize data pipeline models for performance and scalability in a cloud-based data warehouse environment (AWS S3 Redshift).Ensure data quality and governance through well-defined data transformation processes testing frameworks and documentation.Tro