In [1]:
# Import Libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')


In [2]:
# Read the occupation/skill table from the CSV file (path is relative to the
# notebook's working directory) into the DataFrame used by the cells below.
data = pd.read_csv(filepath_or_buffer='Data/occupation_skills.csv')

In [3]:
""" Built an application that will take an occupation as input and return the top 10 skills required for that occupation. 
The application should also return the top 10 occupations that require the same skill. 
The application should be able to handle cases where the occupation or skill is not in the dataset.
The application should be able to look up the occupation and skill by their labels and not by their URIs.
The application should be able to handle cases where the occupation or skill label is not an exact match to the label in the dataset,
but matches the altLabel in the dataset.
"""
def find_top_skills_for_occupation(occupation_input, data):
    """Return the ten most frequent skills for an occupation looked up by label.

    The query is compared case-insensitively, ignoring surrounding whitespace,
    first against ``occupationLabel`` and, only if that yields no rows, against
    each newline-separated entry of ``alt_occupationLabel``. When neither gives
    an exact match, the lookup is delegated to
    ``find_top_skills_for_occupation_partial_matches``.

    Parameters
    ----------
    occupation_input : str
        Occupation label (or alternative label) to look up.
    data : pandas.DataFrame
        Table with the columns ``occupationLabel``, ``alt_occupationLabel``
        and ``skillLabel``. It is only read; no columns are added to it.

    Returns
    -------
    pandas.Series or str
        ``skillLabel`` value counts (at most ten entries) for the matched rows,
        or whatever the partial-match fallback returns when there is no exact
        match.
    """
    needle = occupation_input.strip().lower()

    def _normalize(value):
        # Missing cells (NaN/None) are not strings and must never match.
        return value.strip().lower() if isinstance(value, str) else None

    def _has_alt_label(value):
        # A missing alt-label cell would otherwise raise on the membership test.
        if not isinstance(value, str):
            return False
        # Strip each alternative so '\r\n' endings or stray spaces do not
        # defeat an otherwise exact match; skip empty pieces left by
        # trailing newlines.
        alternatives = (alt.strip() for alt in value.lower().split('\n'))
        return any(alt == needle for alt in alternatives if alt)

    if needle:
        # Masks are computed on the fly instead of being stored as helper
        # columns, so the caller's DataFrame is left untouched.
        label_mask = (data['occupationLabel'].map(_normalize) == needle).astype(bool)
        matches = data[label_mask]
        if matches.empty:
            alt_mask = data['alt_occupationLabel'].map(_has_alt_label).astype(bool)
            matches = data[alt_mask]
    else:
        # A blank query cannot match any label exactly.
        matches = data.iloc[0:0]

    # No exact match on either column: fall back to substring matching.
    if matches.empty:
        return find_top_skills_for_occupation_partial_matches(occupation_input, data)

    # Count how often each skill occurs among the matched rows; keep the top 10.
    return matches['skillLabel'].value_counts().head(10)



def find_top_skills_for_occupation_partial_matches(occupation_input, dataset):
    """Return the ten most frequent skills for occupations containing the query.

    A row matches when the lower-cased, stripped query is a substring of its
    ``occupationLabel`` or of any newline-separated entry in its
    ``alt_occupationLabel``. Cells that are not strings (e.g. NaN) never match.

    Parameters
    ----------
    occupation_input : str
        Fragment of an occupation label to search for.
    dataset : pandas.DataFrame
        Table with the columns ``occupationLabel``, ``alt_occupationLabel``
        and ``skillLabel``. It is only read, never modified or copied.

    Returns
    -------
    pandas.Series or str
        ``skillLabel`` value counts (at most ten entries) for the matching
        rows, or the message ``"No matches found for occupation: <input>"``
        when no row matches or the query is blank.
    """
    needle = occupation_input.lower().strip()
    no_match_message = f"No matches found for occupation: {occupation_input}"

    # An empty string is a substring of every label; report "no matches"
    # instead of silently returning the top skills of the entire dataset.
    if not needle:
        return no_match_message

    def _label_hit(value):
        # Guard against NaN/None labels, which have no .lower().
        return isinstance(value, str) and needle in value.lower()

    def _alt_hit(value):
        # Alternative labels are stored as one newline-separated string.
        return isinstance(value, str) and any(
            needle in alt for alt in value.lower().split('\n')
        )

    # Build a boolean mask directly rather than copying the frame and running
    # a row-wise apply.
    mask = (
        dataset['occupationLabel'].map(_label_hit).astype(bool)
        | dataset['alt_occupationLabel'].map(_alt_hit).astype(bool)
    )
    filtered_data = dataset[mask]

    if filtered_data.empty:
        return no_match_message

    return filtered_data['skillLabel'].value_counts().head(10)



In [4]:
# Demo: look up the exact label "welder" and display its most frequent skills.
test_occupation = "welder"

find_top_skills_for_occupation(occupation_input=test_occupation, data=data)

skillLabel
cutting technologies                         1
manufacturing of steam generators            1
perform welding inspection                   1
apply preliminary treatment to workpieces    1
determine suitability of materials           1
assemble metal parts                         1
prepare pieces for joining                   1
apply soldering techniques                   1
operate brazing equipment                    1
operate automated process control            1
Name: count, dtype: int64

In [5]:
# Demo: substring search with the fragment "weld", which pools the skills of
# every occupation whose label or alternative label contains it.
test_occupation_partial = "weld"

find_top_skills_for_occupation_partial_matches(
    occupation_input=test_occupation_partial,
    dataset=data,
)


skillLabel
keep records of work progress       13
types of metal                      12
perform test run                    10
ferrous metal processing            10
quality standards                    9
spot metal imperfections             9
wear appropriate protective gear     9
ensure equipment availability        9
operate soldering equipment          9
consult technical resources          9
Name: count, dtype: int64