In [1]:
# Import Libraries

import numpy as np
import pandas as pd


# Ignore warnings
# NOTE(review): filterwarnings('ignore') with no category or module filter
# silences every warning raised after this point, not just noisy ones.
# Consider narrowing it to a specific category once the noisy source is known.
import warnings
warnings.filterwarnings('ignore')


In [4]:
# Define OccupationSkillAnalyzer class that will be used to analyze the dataset
class OccupationSkillAnalyzer:
    """Look up frequent skills for an occupation, or occupations for a skill.

    The dataset is expected to provide the columns ``occupationLabel``,
    ``alt_occupationLabel``, ``skillLabel`` and ``alt_skillLabel``. The two
    ``alt_*`` columns hold newline-separated alternative labels.

    Lookup order used by :meth:`analyze_input`:

    1. exact match on the occupation label,
    2. exact match on one of the alternative occupation labels,
    3. substring match on occupation / alternative labels, dropping one
       trailing character at a time while nothing matches and the query is
       longer than ``min_length``,
    4. skill lookup (exact label, exact alternative label, then substring).

    NOTE: step 4 receives the query as it stands after step 3, i.e. possibly
    truncated. This mirrors the long-standing behaviour of the class.
    """

    DEFAULT_DATASET_PATH = 'Data/occupation_skills.csv'
    NO_MATCH_MESSAGE = "No match found."
    TOP_N = 10

    def __init__(self, dataset_path=DEFAULT_DATASET_PATH, data=None):
        """Load the dataset and pre-compute lower-cased occupation columns.

        Args:
            dataset_path: CSV file to read when ``data`` is not given.
            data: Optional, already loaded DataFrame. It is copied, so the
                caller's frame is never modified.
        """
        if data is None:
            self.data = pd.read_csv(dataset_path)
        else:
            self.data = data.copy()
        # Normalize columns for case-insensitive searching
        self.data['occupationLabel_normalized'] = self.data['occupationLabel'].str.lower()
        self.data['alt_occupationLabel_normalized'] = (
            self.data['alt_occupationLabel'].apply(self._split_alternatives)
        )

    @staticmethod
    def _split_alternatives(value):
        """Return the lower-cased, newline-separated labels; [] for non-strings."""
        return value.lower().split('\n') if isinstance(value, str) else []

    @staticmethod
    def _partial_mask(labels, alternatives, needle):
        """Boolean mask of rows whose label or any alternative contains ``needle``.

        The needle is matched literally (``regex=False``) so characters such
        as ``+`` or ``(`` in user input cannot raise or change the meaning of
        the search, and missing labels count as "no match" (``na=False``).
        """
        label_hit = labels.str.contains(needle, regex=False, na=False).astype(bool)
        alt_hit = alternatives.apply(
            lambda alts: isinstance(alts, list) and any(needle in alt for alt in alts)
        ).astype(bool)
        return label_hit | alt_hit

    def analyze_input(self, input_string):
        """Return the top skills (or occupations) for a free-text query.

        Args:
            input_string: Occupation or skill name; case and surrounding
                whitespace are ignored.

        Returns:
            A ``pandas.Series`` of counts (at most ``TOP_N`` entries), or the
            string ``"No match found."``. Blank or non-string input also
            yields ``"No match found."``, because an empty query would
            otherwise be a substring of every label.
        """
        if not isinstance(input_string, str):
            return self.NO_MATCH_MESSAGE
        normalized_input = input_string.strip().lower()
        if not normalized_input:
            return self.NO_MATCH_MESSAGE
        return self._find_top_skills_for_occupation(normalized_input)

    def _find_top_skills_for_occupation(self, occupation_input):
        """Exact occupation lookup, falling back to partial matching."""
        matches = self.data[self.data['occupationLabel_normalized'] == occupation_input]

        if matches.empty:
            alt_hit = self.data['alt_occupationLabel_normalized'].apply(
                lambda alts: isinstance(alts, list) and occupation_input in alts
            ).astype(bool)
            matches = self.data[alt_hit]

        if not matches.empty:
            skills_frequency = matches['skillLabel'].value_counts().head(self.TOP_N)
            if not skills_frequency.empty:
                return skills_frequency

        # No exact hit (or the hits carry no skills): try partial matches
        return self._find_top_skills_for_occupation_partial_matches(occupation_input)

    def _find_top_skills_for_occupation_partial_matches(self, occupation_input, min_length=4):
        """Substring occupation lookup with progressive right-truncation.

        While nothing matches and the query is longer than ``min_length``,
        the last character is dropped and the search repeated. If no skills
        are found, the (possibly shortened) query is handed to the skill
        lookup.
        """
        # Never shorten to the empty string: it is contained in every label
        # and, with a non-positive min_length, the search could not terminate.
        floor = max(min_length, 1)
        labels = self.data['occupationLabel_normalized']
        alternatives = self.data['alt_occupationLabel_normalized']

        candidate = occupation_input
        matches = self.data[self._partial_mask(labels, alternatives, candidate)]
        while matches.empty and len(candidate) > floor:
            candidate = candidate[:-1]
            matches = self.data[self._partial_mask(labels, alternatives, candidate)]

        skills_frequency = matches['skillLabel'].value_counts().head(self.TOP_N)
        if not skills_frequency.empty:
            return skills_frequency
        return self._find_occupations_for_skill(candidate)

    def _find_occupations_for_skill(self, skill_input):
        """Return the occupations most often associated with a skill.

        Tries an exact skill label, then an exact alternative label, then a
        literal substring match on either. Returns a ``pandas.Series`` of
        counts or ``"No match found."``.
        """
        skill_input_normalized = skill_input.lower().strip()
        if not skill_input_normalized:
            return self.NO_MATCH_MESSAGE

        # Derive only the two normalized columns instead of copying the frame.
        skill_labels = self.data['skillLabel'].str.lower()
        alt_skill_labels = self.data['alt_skillLabel'].apply(self._split_alternatives)

        matches = self.data[skill_labels == skill_input_normalized]
        if matches.empty:
            alt_exact = alt_skill_labels.apply(
                lambda alts: skill_input_normalized in alts
            ).astype(bool)
            matches = self.data[alt_exact]
        if matches.empty:
            matches = self.data[
                self._partial_mask(skill_labels, alt_skill_labels, skill_input_normalized)
            ]
        if matches.empty:
            return self.NO_MATCH_MESSAGE

        occupations_frequency = matches['occupationLabel'].value_counts().head(self.TOP_N)
        return occupations_frequency if not occupations_frequency.empty else self.NO_MATCH_MESSAGE




In [5]:
# Example usage: build the analyzer once, then look up a single job title
analyzer = OccupationSkillAnalyzer()
sample_title = 'tradesman helper'
result = analyzer.analyze_input(sample_title)
print(result)

skillLabel
public relations                             1
study topics                                 1
use databases                                1
identify research topics                     1
develop communications strategies            1
liaise with sports organisations             1
liaise with local authorities                1
advise legislators                           1
evaluate humanitarian programme proposals    1
promote equality in sport activities         1
Name: count, dtype: int64


In [None]:
# Load the job titles used to exercise the analyzer
test_data_path = 'Test_occupations.csv'
test_data = pd.read_csv(test_data_path)

In [None]:
# Look up every job_title from the test data in turn and print what the analyzer returns
for i, job_title in test_data['job_title'].items():
    result = analyzer.analyze_input(job_title)
    print(f"Top skills for {job_title}:")
    print(result)
    print("\n")

    

Top skills for Admin:
skillLabel
manage budgets                          31
manage staff                            25
use different communication channels    21
financial management                    19
manage accounts                         19
liaise with managers                    18
corporate social responsibility         18
handle financial transactions           17
follow company standards                16
create solutions to problems            15
Name: count, dtype: int64


Top skills for Welder:
skillLabel
cutting technologies                         1
manufacturing of steam generators            1
perform welding inspection                   1
apply preliminary treatment to workpieces    1
determine suitability of materials           1
assemble metal parts                         1
prepare pieces for joining                   1
apply soldering techniques                   1
operate brazing equipment                    1
operate automated process control            1
Name: