In [None]:
# Import Libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')


In [None]:
# Load the data
# The CSV path is relative, so it is resolved against the current working directory.
# The lookup functions in this notebook read the columns occupationLabel,
# alt_occupationLabel and skillLabel from this frame.
data = pd.read_csv('Data/occupation_skills.csv')

In [None]:
""" Built an application that will take an occupation as input and return the top 10 skills required for that occupation. 
The application should also return the top 10 occupations that require the same skill. 
The application should be able to handle cases where the occupation or skill is not in the dataset.
The application should be able to look up the occupation and skill by their labels and not by their URIs.
The application should be able to handle cases where the occupation or skill label is not an exact match to the label in the dataset,
but matches the altLabel in the dataset.
"""
def find_top_skills_for_occupation(occupation_input, data):
    """Return the ten most frequent skills for an exactly-named occupation.

    The occupation is matched case-insensitively, first against
    ``occupationLabel`` and, only when that finds nothing, against the
    newline-separated alternatives stored in ``alt_occupationLabel``.

    Args:
        occupation_input: Occupation name to look up. Leading/trailing
            whitespace and letter case are ignored.
        data: DataFrame with ``occupationLabel``, ``alt_occupationLabel`` and
            ``skillLabel`` columns.

    Returns:
        A Series of skill counts (at most 10 entries, most frequent first),
        or the string ``"No matches found for occupation: ..."`` when no row
        matches.

    Side effects:
        Adds (or overwrites) the ``occupationLabel_normalized`` and
        ``alt_occupationLabel_normalized`` columns on ``data``. They are kept
        on the frame so existing code that reads them keeps working.
    """

    def _split_alt_labels(value):
        # Missing alternatives arrive as NaN/None; treat them as "no
        # alternatives" instead of letting the membership test below fail.
        if not isinstance(value, str):
            return []
        pieces = (piece.strip() for piece in value.lower().split('\n'))
        return [piece for piece in pieces if piece]

    # Normalize input and data for case-insensitive matching
    occupation_input_normalized = occupation_input.strip().lower()
    data['occupationLabel_normalized'] = (
        data['occupationLabel'].str.strip().str.lower()
    )
    data['alt_occupationLabel_normalized'] = (
        data['alt_occupationLabel'].apply(_split_alt_labels)
    )

    # Prefer an exact hit on the primary label.
    matches = data[data['occupationLabel_normalized'] == occupation_input_normalized]

    # Fall back to the alternative labels only when the primary label misses.
    if matches.empty:
        alt_hit = data['alt_occupationLabel_normalized'].apply(
            lambda alternatives: occupation_input_normalized in alternatives
        )
        # astype(bool) keeps this a boolean mask even for an empty frame.
        matches = data[alt_hit.astype(bool)]

    # If no matches found, return message indicating so
    if matches.empty:
        return f"No matches found for occupation: {occupation_input}"

    # Aggregate skills and return the ten most frequent ones
    return matches['skillLabel'].value_counts().head(10)

def find_top_skills_for_occupation_partial_matches(occupation_input, data):
    """Return the ten most frequent skills for occupations containing a term.

    A row matches when the query, compared case-insensitively, is a substring
    of its ``occupationLabel`` or of its ``alt_occupationLabel`` text.

    The normalisation is done locally from the raw columns, so the function
    works on a freshly loaded frame and does not modify ``data``.

    Args:
        occupation_input: Search term. Leading/trailing whitespace and letter
            case are ignored.
        data: DataFrame with ``occupationLabel``, ``alt_occupationLabel`` and
            ``skillLabel`` columns.

    Returns:
        A Series of skill counts (at most 10 entries, most frequent first),
        or the string ``"No matches found for occupation: ..."`` when no row
        matches.
    """
    occupation_input_normalized = occupation_input.lower().strip()

    labels = data['occupationLabel'].str.lower()
    alt_labels = data['alt_occupationLabel'].str.lower()

    # regex=False: the query is plain text, so characters such as "(" or "+"
    # must not be read as a pattern. na=False: rows with a missing label
    # simply do not match instead of raising.
    label_hit = labels.str.contains(
        occupation_input_normalized, regex=False, na=False
    )
    alt_hit = alt_labels.str.contains(
        occupation_input_normalized, regex=False, na=False
    )

    filtered_data = data[label_hit | alt_hit]

    if filtered_data.empty:
        return f"No matches found for occupation: {occupation_input}"

    return filtered_data['skillLabel'].value_counts().head(10)



In [None]:
# Test Occupation for welder
# Sample query passed to both lookup calls in the cells that follow.

test_occupation = "welder"


In [None]:
# Testing Functions
# Exact, case-insensitive lookup of the sample occupation: the primary label is
# tried first, then the alternative labels.
find_top_skills_for_occupation(test_occupation, data)

In [None]:
# Partial (substring) lookup of the same sample occupation.
find_top_skills_for_occupation_partial_matches(test_occupation, data)