In [13]:
import json
import random
import os
import argparse

import pandas as pd
import numpy as np

from collections import defaultdict
import yaml
from IPython.display import display

In [2]:
# Quick environment probe: contents of the parent directory, the current
# working directory, and whether the config is reachable via "Code/config/run.yaml".
print(
    os.listdir(".."),
    os.getcwd(),
    os.path.exists("Code/config/run.yaml"),
    sep="\n",
)

['.gitignore', 'config', 'Experiment', 'jcrec', 'requirements.txt', 'results']
C:\Users\minhm\OneDrive - Université Côte d'Azur\I3S_CNRS_Internship\Project\Code\Experiment
False


In [90]:
config_path = "../config/run.yaml"

# Pin the encoding: without it, open() decodes with the platform locale
# (a legacy code page on Windows), so non-ASCII characters in the YAML file
# could be parsed differently from one machine to another.
with open(config_path, "r", encoding="utf-8") as f:
    # NOTE(review): FullLoader can build more than plain scalars/lists/dicts;
    # if the config only holds plain data, yaml.safe_load would be the stricter choice.
    config = yaml.load(f, Loader=yaml.FullLoader)

In [91]:
# Manual check: join the configured taxonomy path onto "../../" (two directories
# above the notebook's working directory) and print the result.
path = os.path.join("../../", config["taxonomy_path"])  # path under test
print(path)

../../Data - Collection/Final/taxonomy.csv


In [147]:
def load_skills(config):
    """
    Read the skill taxonomy CSV and derive the skill set plus a skill-to-index mapping.

    The CSV location is ``config["taxonomy_path"]`` joined onto ``"../../"``.
    Before any processing, a three-row preview, the column index and the row
    count of the table are shown.

    Args:
        config (dict): Must provide the keys ``"taxonomy_path"`` and ``"level_3"``.

    Returns:
        tuple: ``(skills, skills2int)``.

        * ``config["level_3"]`` truthy: every distinct ``Type Level 3`` value gets an
          integer (numbered by ``enumerate`` over ``unique()``); ``skills2int`` maps each
          ``unique_id`` to the integer of its level, and ``skills`` is the set of
          those integers.
        * ``config["level_3"]`` falsy: ``skills`` is the set of ``unique_id`` values and
          ``skills2int`` numbers them in the iteration order of that set.
    """
    taxonomy = pd.read_csv(os.path.join("../../", config["taxonomy_path"]))
    display(taxonomy.head(3))
    print("\n")
    print(taxonomy.columns)
    print(f"Total skills :{len(taxonomy)}")

    if not config["level_3"]:
        # No taxonomy grouping: each unique_id is its own skill.
        skills = set(taxonomy["unique_id"])
        skills2int = {}
        for position, skill_id in enumerate(skills):
            skills2int[skill_id] = position
        return skills, skills2int

    # Level 3 grouping: one taxonomy entry may be shared by several skills.
    # Level 3 is preferred because levels 1 and 2 are too broad and yield
    # overly general domains.
    level_column = taxonomy["Type Level 3"]

    # distinct level name -> integer code
    level_codes = dict(
        (level_name, code) for code, level_name in enumerate(level_column.unique())
    )

    # unique_id -> level name (a repeated unique_id keeps its last level)
    id_to_level = dict(zip(taxonomy["unique_id"], level_column))

    # unique_id -> integer code of its level, e.g. 1000 -> 0
    skills2int = {}
    for skill_id, level_name in id_to_level.items():
        skills2int[skill_id] = level_codes[level_name]

    skills = set(skills2int.values())
    print("total taxonomy skills :", len(skills), "\n")
    return skills, skills2int

In [149]:
# Build the skill set and the skill-id -> integer mapping from the taxonomy;
# `skills2int` is read again further down when the learners are loaded.
skills,skills2int = load_skills(config=config)

Unnamed: 0.1,Unnamed: 0,Source,Type Level 4,altLabels,Definition,Dimension,Type Level 1,Type Level 2,Type Level 3,unique_id,name,name+definition
0,0,http://data.europa.eu/esco/skill/000f1d3d-220f...,Haskell,Haskell,The techniques and principles of software deve...,knowledge,information and communication technologies (icts),information and communication technologies (icts),software and applications development and anal...,1000,Haskell,Haskell : the techniques and principles of sof...
1,2,http://data.europa.eu/esco/skill/0037c821-2898...,develop energy saving concepts,developing concepts for energy saving\r\ndevel...,Use current research results and collaborate w...,skills,management skills,developing objectives and strategies,developing operational policies and procedures,1001,develop energy saving concepts,develop energy saving concepts : use current r...
2,4,http://data.europa.eu/esco/skill/0058526a-11e9...,conduct research on flora,carry out research on flora\r\nflora research\...,Collect and analyse data about plants in order...,skills,information skills,analysing and evaluating information and data,analysing scientific and medical data,1002,conduct research on flora,conduct research on flora : collect and analys...




Index(['Unnamed: 0', 'Source', 'Type Level 4', 'altLabels', 'Definition',
       'Dimension', 'Type Level 1', 'Type Level 2', 'Type Level 3',
       'unique_id', 'name', 'name+definition'],
      dtype='object')
Total skills :1794
total taxonomy skills : 46 



In [106]:
def load_mastery_levels(config):
    """Load the mastery-level mapping from the JSON file named in the config.

    Args:
        config (dict): Must provide ``"mastery_levels_path"``; the path is joined
            onto ``"../../"``.

    Returns:
        The parsed JSON content (it is also printed), e.g.
        ``{'beginner': 1, 'intermediate': 2, 'expert': 3, 'unknown': -1}``.
    """
    mastery_path = os.path.join("../../", config["mastery_levels_path"])
    # Use a context manager so the handle is closed as soon as parsing is done,
    # and read as UTF-8 (the JSON interchange encoding) rather than the locale default.
    with open(mastery_path, "r", encoding="utf-8") as mastery_file:
        mastery_levels = json.load(mastery_file)
    print(mastery_levels)
    return mastery_levels

In [134]:
# Keep the label -> numeric level mapping returned by load_mastery_levels
# so it can be passed to the learner-loading step.
mastery_levels = load_mastery_levels(config)

{'beginner': 1, 'intermediate': 2, 'expert': 3, 'unknown': -1}


In [157]:
def get_avg_skills(skill_list,skills2int,mastery_levels, replace_unk):
    """Average the mastery levels of a skill list, one value per mapped skill.

    Args:
        skill_list: Iterable of ``(skill, mastery_level)`` pairs.
        skills2int (dict): Maps each skill to the integer used as the result key.
        mastery_levels (dict): Maps a mastery label (string) to its numeric level.
        replace_unk: Value substituted when a label maps to ``-1``.

    Returns:
        defaultdict: mapped skill -> rounded mean of its numeric levels. Pairs
        whose mastery level is not a string key of ``mastery_levels`` are ignored.
        Rounding uses the built-in ``round`` (ties go to the even integer).
    """
    levels_by_skill = defaultdict(list)

    for raw_skill, label in skill_list:
        # Only known string labels are counted; anything else is skipped.
        if not (isinstance(label, str) and label in mastery_levels):
            continue
        numeric_level = mastery_levels[label]
        if numeric_level == -1:
            numeric_level = replace_unk
        levels_by_skill[skills2int[raw_skill]].append(numeric_level)

    # Several entries can land on the same mapped skill, so collapse each
    # list of levels into its rounded mean.
    for skill_index, collected in levels_by_skill.items():
        levels_by_skill[skill_index] = round(sum(collected) / len(collected))

    return levels_by_skill

In [158]:
def load_learners(config, skills, mastery_levels, replace_unk=1, skills2int=None):
    """Build the learner x skill proficiency matrix from the CV file named in the config.

    Args:
        config (dict): Must provide ``"cv_path"`` (JSON file, joined onto ``"../../"``)
            and ``"max_cv_skills"``.
        skills: Collection of skills; only its length is used (number of columns).
        mastery_levels (dict): Label -> numeric level mapping passed to ``get_avg_skills``.
        replace_unk (int, optional): The value to replace the unknown mastery levels.
            Defaults to 1.
        skills2int (dict, optional): Skill -> column index mapping passed to
            ``get_avg_skills``. When omitted, the notebook-level ``skills2int`` is
            used, which is what the function previously read implicitly.

    Returns:
        tuple: ``(learners_array, learners_index)``.

        * ``learners_array``: integer array with one row per kept learner and one
          column per skill; cells hold the averaged mastery level, 0 where the
          learner has no entry.
        * ``learners_index``: dict holding both directions, row index -> learner id
          and learner id -> row index.

    Raises:
        NameError: If ``skills2int`` is neither passed nor defined at notebook level.
    """
    if skills2int is None:
        # Backward compatibility: fall back to the notebook-level mapping.
        try:
            skills2int = globals()["skills2int"]
        except KeyError:
            raise NameError("name 'skills2int' is not defined") from None

    # Context manager so the CV file handle is closed once parsing is done.
    with open(os.path.join("../../", config["cv_path"]), "r", encoding="utf-8") as cv_file:
        learners = json.load(cv_file)  # e.g. '36856210': [[1024, 'expert'], [2017, 'expert'], ...]

    max_learner_skills = config["max_cv_skills"]
    learners_index = dict()

    # Allocate for every learner up front (default level 0); rows left unused
    # because a learner was skipped are trimmed off at the end.
    learners_array = np.zeros((len(learners), len(skills)), dtype=int)
    index = 0

    for learner_id, learner in learners.items():
        avg_learner = get_avg_skills(learner, skills2int, mastery_levels, replace_unk)

        # Learners with more skills than the configured maximum are left out.
        if len(avg_learner) > max_learner_skills:
            continue

        for skill, level in avg_learner.items():
            learners_array[index, skill] = level

        # Learner ids come from JSON object keys (strings), so they cannot
        # collide with the integer row indices stored in the same dict.
        learners_index[index] = learner_id
        learners_index[learner_id] = index

        index += 1

    # Keep only the rows that were actually filled.
    learners_array = learners_array[:index]

    return learners_array, learners_index

In [159]:
# NOTE(review): the output recorded for this cell is a TypeError
# ("argument of type 'NoneType' is not iterable") — presumably `mastery_levels`
# was None in the kernel when it ran; re-run the cells above in order and confirm.
load_learners(config,skills,mastery_levels)

TypeError: argument of type 'NoneType' is not iterable

In [112]:
# NOTE(review): the recorded output is a NameError — `skills` only exists after
# the cell that calls load_skills has run in the current kernel, so execute the
# notebook top to bottom before this cell.
print(skills)

NameError: name 'skills' is not defined