## Importing libraries

In [23]:
import numpy as np
import pandas as pd 

## Reading concatenated datafile with skills

In [24]:
# Load the concatenated job postings (including extracted skills) into a DataFrame
df = pd.read_csv(filepath_or_buffer='job_postings.csv')

## Formatting compensation info

In [25]:
def split_salary_range(row):
    """
    Splits a salary range string into its components: compensation period, minimum range, and maximum range.

    Parameters:
    row (str): A string representing the salary range in the format '[compensation_period, min_range, max_range]'.
               Whitespace around the commas is optional. If the row is 'not specified', is not a string
               (e.g. NaN from an empty cell), or cannot be parsed, NaN is returned for all components.

    Returns:
    pd.Series: A pandas Series containing the following fields:
        - 'compensation_period' (str or NaN): The period for which the compensation applies (e.g., 'yearly', 'monthly').
        - 'min_range' (float or NaN): The minimum salary value in the range.
        - 'max_range' (float or NaN): The maximum salary value in the range.
    """
    missing = {'compensation_period': np.nan, 'min_range': np.nan, 'max_range': np.nan}

    # Non-string values (NaN/None) are treated as "no salary given" up front,
    # rather than falling into the error path through a failed .strip() call.
    if not isinstance(row, str) or row.strip() == "not specified":
        return pd.Series(missing)

    # Drop the surrounding brackets, split on commas and trim each part so that
    # both '[a, 1, 2]' and '[a,1,2]' are accepted.
    parts = [part.strip() for part in row.strip().strip("[]").split(",")]

    # Anything other than exactly three components is not a recognised salary range
    if len(parts) != 3:
        return pd.Series(missing)

    period, low, high = parts
    try:
        return pd.Series({'compensation_period': period, 'min_range': float(low), 'max_range': float(high)})
    except ValueError as e:
        # Only the float conversions can fail here; report and fall back to NaN
        print("Error:", e)
        return pd.Series(missing)


### Formatting compensation info - execution

In [None]:
# Parse every salary string and store the three resulting components as new columns
df[['compensation_period', 'min_range', 'max_range']] = (
    df['salary_range'].apply(split_salary_range)
)
# Reference salary: row-wise mean of the minimum and maximum bounds
df['ref_salary'] = df[['min_range', 'max_range']].mean(axis='columns')

## Saving skills data to a text file (to be uploaded to ChatGPT)

In [28]:
# Collect every distinct skill token found across all postings
unique_skills = set()
# dropna() skips postings without extracted skills, which would otherwise fail on .split()
for skills in df['extracted_skills'].dropna():
    # Tokens are kept exactly as split (no stripping) so the contents and order of the
    # exported list stay unchanged for the positional pairing with the standardized list.
    unique_skills.update(skills.split(','))

# Convert the set to a sorted list
unique_skills_list = sorted(unique_skills)

# Write one skill per line; explicit UTF-8 avoids platform-dependent default encodings
# and matches the encoding used when the standardized list is read back.
with open('unique_skills_list.txt', 'w', encoding='utf-8') as file:
    file.writelines(skill + '\n' for skill in unique_skills_list)

## Updating skill list with new file returned from ChatGPT

In [30]:
def read_skills_from_file(file_path):
    """
    Loads skills from a text file that stores one skill per line.

    Parameters:
    file_path (str): Location of the text file to read.

    Returns:
    list: One entry per line of the file, in file order, with surrounding
          whitespace (including the newline) removed. Blank lines are kept
          as empty strings.
    """
    # Undecodable bytes are dropped (errors='ignore') instead of raising
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as handle:
        return [line.strip() for line in handle]

In [31]:
# Path to the file containing standardized skills
standardized_skills_file_path = 'standardized_skills_list.txt'

# Read and process the standardized skills from the file
standardized_skills_list = read_skills_from_file(standardized_skills_file_path)

# The two lists are paired purely by position and zip() stops at the shorter one,
# so a length mismatch means some skills are left unmapped or paired incorrectly.
if len(standardized_skills_list) != len(unique_skills_list):
    print(
        "Warning: standardized skills list has",
        len(standardized_skills_list),
        "entries but unique skills list has",
        len(unique_skills_list),
    )

# Create a dictionary for mapping unique skills to standardized skills.
# Keys are stripped because update_skills() strips each skill before looking it up;
# unstripped keys such as ' sql' would never be matched.
skills_mapping = {
    original.strip(): standardized
    for original, standardized in zip(unique_skills_list, standardized_skills_list)
}

In [32]:
def update_skills(skills, mapping):
    """
    Rewrites a comma-separated skills string using a lookup of standardized names.

    Parameters:
    skills (str): Skills separated by commas.
    mapping (dict): Lookup from an original skill name to its standardized name.

    Returns:
    str: The skills joined with ', ', each replaced by its standardized name
         when present in the mapping and otherwise left as the trimmed original.
    """
    standardized = []
    for raw_skill in skills.split(','):
        # Trim whitespace before the lookup so ' sql' and 'sql' resolve alike
        name = raw_skill.strip()
        standardized.append(mapping.get(name, name))
    return ', '.join(standardized)

In [33]:
# Standardize every posting's skills in place, passing the mapping through as a keyword argument
df['extracted_skills'] = df['extracted_skills'].apply(update_skills, mapping=skills_mapping)

In [None]:
# Report how many rows have no missing values. The result is deliberately NOT assigned
# back to df: the previous `df = ....shape[0]` replaced the DataFrame with an int, which
# made the later df.to_csv(...) call fail.
# NOTE(review): if the intent was to keep only complete rows, use `df = df.dropna()` instead.
complete_row_count = df.dropna().shape[0]
print("Rows without missing values:", complete_row_count)

## Saving file

In [34]:
# Persist the processed postings; the row index is not written to the file
df.to_csv(path_or_buf='job_postings_updated.csv', index=False)