## Importing libraries

In [None]:
import logging
import time
import pandas as pd
import google.generativeai as genai
import os

## Loading Gemini API key

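Create a Gemini API key by following https://ai.google.dev/gemini-api/docs/api-key and expose it to the notebook as the `GOOGLE_API_KEY` environment variable, which the next cell reads.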

In [None]:
# Configure the Gemini client with the key stored in the GOOGLE_API_KEY
# environment variable. Indexing os.environ raises KeyError if it is not set.
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

## Logging to handle API errors

In [None]:
def format_error(error: Exception) -> str:
    """
    Formats a caught exception as a single-line string suitable for logging.

    Parameters:
    error (Exception): The exception to format. Any object is accepted; its type
        and its str() representation are what end up in the output.

    Returns:
    str: 'Error Type: <type of error>, Error: <message with newlines removed>'.
    """
    # Capture the type before converting to text; after str() it would always be str
    error_type = str(type(error))

    # Newline characters are removed (not replaced) so the message stays on one log line
    message = str(error).replace('\n', '')

    return f'Error Type: {error_type}, Error: {message}'

In [None]:
# Initialize logging for the module
# Configure the root logger so that records at INFO level and above are emitted
logging.basicConfig(level=logging.INFO)
# Module-level logger, named after this module, used by extract_ner for
# progress, rate-limit and error messages
LOGGER = logging.getLogger(__name__)

## Skills and salary extraction

In [None]:
def _first_candidate_text(response):
    """Joins the text parts of the response's first candidate; returns None when there are none."""
    if response.candidates and response.candidates[0].content.parts:
        return ''.join(part.text for part in response.candidates[0].content.parts)
    return None


def _generate_with_retry(model, prompt, idx, extract_text, max_attempts=20, backoff_seconds=8):
    """
    Sends a prompt to the model and extracts text from the response, retrying on rate limits.

    Parameters:
    model: Object exposing generate_content(prompt) (here a genai.GenerativeModel).
    prompt (str): The prompt to send.
    idx (int): Zero-based index of the job description; only used in log messages.
    extract_text (callable): Converts the raw response into a string (or None). It runs
        inside the try block so errors raised while reading the response are handled
        exactly like errors raised by the request itself.
    max_attempts (int): Maximum number of requests made when the API keeps rate limiting.
    backoff_seconds (int): Seconds to sleep after a rate-limit error before retrying.

    Returns:
    str or None: The extracted text, or None if a non-rate-limit error occurred or every
    attempt was rate limited.
    """
    for _ in range(max_attempts):
        try:
            return extract_text(model.generate_content(prompt))
        except Exception as error:
            # Rate limiting is recognised by '429' appearing in the error text
            if '429' in str(error):
                LOGGER.warning(f'Rate limit on {idx + 1}. Sleeping {backoff_seconds} seconds')
                time.sleep(backoff_seconds)
                continue

            # Any other error is logged and not retried
            LOGGER.error(f'{format_error(error)} on {idx + 1} via model.generate_content.')
            return None

    LOGGER.error(f'Giving up on {idx + 1} after {max_attempts} rate-limited attempts.')
    return None


def extract_ner(idx, job_description):
    """
    Extracts Named Entity Recognition (NER) data, including skills and salary range, from a job description using the Gemini model.

    Two requests are made: one asking for a list of skills and one asking for the salary
    range. Both are retried a bounded number of times when the API reports a rate limit.

    Parameters:
    idx (int): The index of the current job description in the dataset.
    job_description (str): The job description text from which to extract NER data.

    Returns:
    tuple or None: A tuple containing the index, a comma-separated string of extracted
    skills, and the salary message ('not specified' when no salary text was obtained).
    None is returned when the skills request fails or yields no text.
    """

    LOGGER.info(f'Working on index {idx + 1}')

    # Initialize the Gemini model
    model = genai.GenerativeModel(model_name='models/gemini-pro')

    # Prepare the input message for extracting skills.
    # The last line interpolates the description with single braces; doubled braces
    # would send the literal text "{job_description}" instead of the posting.
    input_message = (
        "USER: Imagine you're an NER AI model. Your task is to extract technical skills, "
        "frameworks, languages, software, and concepts from the given job description. "
        "Be concise and focus only on the relevant skills required for the candidate. "
        "Do not include outcomes or results of using these skills. "
        "Standardize the names of skills and software to ensure clarity and meaningfulness. "
        "You can remove or add text to do so. Provide a single list of required skills. "
        "List 1-3 word entities only, without sentences. Format your response as a list.\n"
        f"USER: Here is the description: {job_description}"
    )

    def skills_text(response):
        # Keep the dedicated log line for responses that carry no usable content
        text = _first_candidate_text(response)
        if text is None:
            LOGGER.error(f'No valid content in response for {idx + 1}.')
        return text

    gemini_message = _generate_with_retry(model, input_message, idx, skills_text)

    # Handle invalid or missing gemini_message
    if not gemini_message:
        LOGGER.error(f'Invalid content type for gemini_message on {idx + 1}.')
        return None

    # Process the extracted skills into a cleaned list: one skill per response line.
    # Lines of 3 characters or fewer (measured before cleaning) are dropped.
    # NOTE(review): every hyphen is removed, including hyphens inside names such as
    # 'scikit-learn'; kept unchanged here to preserve the existing output format.
    extracted_skills = [
        line.replace('-', '').replace(',', '').strip()
        for line in gemini_message.strip().split('\n')
        if len(line) > 3
    ]

    # Convert the list of skills into a comma-separated string
    extracted_skills_str = ', '.join(extracted_skills)

    # Prepare the input message for extracting the salary range
    salary_input_message = (
        "USER: Check if the salary range is mentioned in the job description. If it is mentioned and already specified per year, return [yearly, minimum range, maximum range]. "
        "If the rate is hourly, convert it to yearly by multiplying by 2080 and return [hourly, minimum range, maximum range], with the ranges converted to yearly values. "
        "If the rate is monthly, convert it to yearly by multiplying by 12 and return [monthly, minimum range, maximum range], with the ranges converted to yearly values. "
        "If only the minimum range is provided, repeat the value for the maximum range. If only the maximum range is provided, consider the salary as not provided and return 'not specified'. "
        "Consider different terms, such as salary, compensation, pay, in the context of the description. Convert minimum range and maximum range to numbers. For instance, if the value is $10k, convert it to 10000. "
        "If more than one range is provided, return the first one. Include a salary range even if the compensation value is location-specific. If no salary range is specified, return exactly 'not specified'. "
        f"USER: Here is the posting: {job_description}"
    )

    salary_message = _generate_with_retry(
        model, salary_input_message, idx, lambda response: response.text.strip()
    )

    # Pause briefly after a successful salary request (presumably to pace API calls)
    if salary_message is not None:
        time.sleep(1)

    # Default to 'not specified' if no salary range is found
    if not salary_message:
        salary_message = 'not specified'

    # Return the index, extracted skills, and salary message
    return (idx, extracted_skills_str, salary_message)

### Skills and salary extraction - execution

In [None]:
def apply_ner_to_dataframe(data):
    """
    Runs extract_ner on one (index, row) pair as produced by DataFrame.iterrows().

    Parameters:
    data (tuple): A pair whose first item is the row index and whose second item
        supports lookup of the 'job description' field.

    Returns:
    Whatever extract_ner returns for that index and job description.
    """
    row_index, posting = data
    return extract_ner(row_index, posting['job description'])

In [None]:
# Load the postings, keep one row per (Title, Company, levelMapping) combination,
# and renumber the rows from 0. reset_index() without drop=True also keeps the
# previous index as a regular column.
job_postings = (
    pd.read_csv('./full_postings.csv')
    .drop_duplicates(['Title', 'Company', 'levelMapping'])
    .reset_index()
)

In [None]:
# Re-run extraction only for postings whose stored salary_range is the literal
# string 'not specified' (single quotes included). extract_ner returns None when
# the skills request fails; those entries are skipped because a None among the
# result tuples either makes the DataFrame constructor raise or yields a junk row.
results = [
    result
    for result in map(
        apply_ner_to_dataframe,
        job_postings.loc[job_postings['salary_range'] == "'not specified'"].iterrows(),
    )
    if result is not None
]

# Convert the results to a DataFrame
results_df = pd.DataFrame(results, columns=['index', 'extracted_skills', 'salary_range'])

# Merge the results with the original job_postings DataFrame.
# how='right' keeps only the rows that were re-processed above.
# NOTE(review): job_postings already has a 'salary_range' column (it is used in the
# filter above), so pandas will suffix the overlapping columns (_x / _y) in merged_df
# - TODO confirm this is the intended output schema.
merged_df = job_postings.merge(results_df, left_index=True, right_on='index', how='right').drop('index', axis = 1)

## Saving file

In [None]:
# Save the merged DataFrame to a CSV file (row index is not written)
merged_df.to_csv('./job_postings.csv', index=False)