In [1]:
import json
import random
import os
import argparse

import pandas as pd
import numpy as np

from collections import defaultdict
import yaml
from IPython.display import display

In [2]:
print(os.listdir(".."))
print(os.getcwd())
print(os.path.exists("Code/config/run.yaml"))

['.gitignore', 'config', 'Experiment', 'jcrec', 'requirements.txt', 'results']
C:\Users\minhm\OneDrive - Université Côte d'Azur\I3S_CNRS_Internship\Project\Code\Experiment
False


In [3]:
config_path = "../config/run.yaml"

with open(config_path, "r") as f:
        config = yaml.load(f, Loader=yaml.FullLoader)

In [4]:
path = os.path.join("../../", config["taxonomy_path"]) #test path
print(path)

../../Data - Collection/Final/taxonomy.csv


In [5]:
def load_skills(config):
        """
        Loads skills from a taxonomy file into the instance, processes them based on configuration,
        and creates a mapping of skills to integer indices.

        The method reads a CSV file specified in the configuration and processes the skills
        either by extracting unique values from the 'Type Level 3' column (if level_3 is True)
        or using the 'unique_id' column (if level_3 is False). It populates `skills` with
        a set of skills and `skills2int` with a dictionary mapping skills to integer indices.

        Attributes Modified:
            skills (set): A set of unique skill identifiers or level 3 types.
            skills2int (dict): A dictionary mapping skill identifiers to integer indices.
        """
        # load the skills from the taxonomy file
        skills = pd.read_csv(os.path.join("../../", config["taxonomy_path"]))
        display(skills.head(3))
        print("\n")
        print(skills.columns)
        print(f"Total skills :{len(skills)}")
        # if level_3 is true, we only use the level 3 of the skill taxonomy, then we need to get the unique values in column Type Level 3
        ## Note: A single taxonomy skill may be shared across multiple skills. Using Level 3 taxonomy is preferred
        # as it maintains effective skill categorization. Levels 1 or 2 are too broad, resulting in overly general domains.
        if config["level_3"]:
            # get all the unique values in column Type Level 3
            level2int = {
                level: i for i, level in enumerate(skills["Type Level 3"].unique())
            }

            # make a dict from column unique_id to column Type Level 3
            skills_dict = dict(
                zip(skills["unique_id"], skills["Type Level 3"])
            )

            # map skills_dict values to level2int
            skills2int = {
                key: level2int[value] for key, value in skills_dict.items()
            }
            skills = set(skills2int.values())
            print(f"total taxonomy skills :",len(skills), "\n")
            #print(level2int) #output : software and applications development and analysis : 0
            #print(skills_dict) #output : 1000: software and applications development and analysis
            #print(skills2int) #output : 1000: 0
            return skills,skills2int
        # if level_3 is false, we use the unique_id column as the skills ~ no taxonomy levels
        else:
            skills = set(skills["unique_id"])
            skills2int = {skill: i for i, skill in enumerate(skills)}
            #print(len(skills)) #output : 1794
            #print(len(skills2int)) #output : 1794
            return skills,skills2int

In [6]:
skills,skills2int = load_skills(config=config)

Unnamed: 0.1,Unnamed: 0,Source,Type Level 4,altLabels,Definition,Dimension,Type Level 1,Type Level 2,Type Level 3,unique_id,name,name+definition
0,0,http://data.europa.eu/esco/skill/000f1d3d-220f...,Haskell,Haskell,The techniques and principles of software deve...,knowledge,information and communication technologies (icts),information and communication technologies (icts),software and applications development and anal...,1000,Haskell,Haskell : the techniques and principles of sof...
1,2,http://data.europa.eu/esco/skill/0037c821-2898...,develop energy saving concepts,developing concepts for energy saving\r\ndevel...,Use current research results and collaborate w...,skills,management skills,developing objectives and strategies,developing operational policies and procedures,1001,develop energy saving concepts,develop energy saving concepts : use current r...
2,4,http://data.europa.eu/esco/skill/0058526a-11e9...,conduct research on flora,carry out research on flora\r\nflora research\...,Collect and analyse data about plants in order...,skills,information skills,analysing and evaluating information and data,analysing scientific and medical data,1002,conduct research on flora,conduct research on flora : collect and analys...




Index(['Unnamed: 0', 'Source', 'Type Level 4', 'altLabels', 'Definition',
       'Dimension', 'Type Level 1', 'Type Level 2', 'Type Level 3',
       'unique_id', 'name', 'name+definition'],
      dtype='object')
Total skills :1794
total taxonomy skills : 46 



In [7]:
def load_mastery_levels(config):
        """Load the mastery levels from the file specified in the config and store it in the class attribute"""
        mastery_levels = json.load(open(os.path.join("../../", config["mastery_levels_path"])))
        print(mastery_levels)
        return mastery_levels

In [8]:
mastery_levels = load_mastery_levels(config)

{'beginner': 1, 'intermediate': 2, 'expert': 3, 'unknown': -1}


In [9]:
def get_avg_skills(skill_list,skills2int,mastery_levels, replace_unk):
        """
    Calculates the average mastery level for each skill in the provided skill list.

        This function processes a list of skills, each with a corresponding mastery level, 
        and computes the average mastery level for each skill. The mastery levels are 
        adjusted if they are in string format and match a predefined set of mastery levels. 
        If a mastery level is not recognized, it is replaced with the specified `replace_unk` value. 

        The function assumes that a single skill may have multiple mastery levels associated with it, 
        and it calculates the average for each skill. The result is a dictionary where the key is 
        the integer index of the skill, and the value is the rounded average mastery level for that skill.

    Parameters:
        skill_list (list of tuples): A list where each element is a tuple containing:
            - skill (str): The skill identifier.
            - mastery_level (str or int): The mastery level of the skill, which could be a string (e.g., 'beginner', 'expert') or an int.
        skills2int (dict): A dictionary mapping skill identifiers/ ID (str) to integer indices. (e.g., 1000: 0)
        mastery_levels (dict): A dictionary mapping mastery level strings to integer values. (e.g., {'beginner': 1, 'intermediate': 2})
        replace_unk (int): The value to replace any unknown mastery levels with (default is -1).

    Returns:
        dict: A dictionary where the keys are the integer taxonomy level 3 skill indices (from `skills2int`), 
              and the values are the rounded average mastery levels for each skill.
        """
        avg_skills = defaultdict(list)
        for skill, mastery_level in skill_list:
            #eg. skill = 1024 , mastery_level = 'beginner'
            # if the mastery level is a string and is in the mastery levels, we replace it with the corresponding value, otherwise we do nothing and continue to the next skill
            if isinstance(mastery_level, str) and mastery_level in mastery_levels:
                mastery_level = mastery_levels[mastery_level] #mapping to an integer
                if mastery_level == -1:
                    mastery_level = replace_unk
                # mapping to an integer which is the id of taxonomy level
                # Mapping skills type 4 of learners to type 3, 
                # so the number of skills may be less than or equal to the original number of skills
                skill = skills2int[skill] 
                avg_skills[skill].append(mastery_level)
        #print(avg_skills) #output :e.g  {4: [3, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 1],...}
        # we take the average of the mastery levels for each skill because on our dataset we can have multiple mastery levels for the same skill (taxonomy)
        for skill in avg_skills.keys():
            avg_skills[skill] = sum(avg_skills[skill]) / len(avg_skills[skill])
            avg_skills[skill] = round(avg_skills[skill])

        return avg_skills #output : e.g {4: 2, 28: 2,...}

In [19]:
def load_learners(config, skills, skills2int,mastery_levels,replace_unk=1):
        """
        Loads learners' skill proficiency data from a specified file and stores it in a numpy array,
        where each learner's skills are represented by their proficiency levels.

        This function reads the learner data from a JSON file, processes their skill proficiency levels,
        and stores them in a numpy array. The proficiency levels are adjusted based on the mastery levels
        provided, and unknown mastery levels are replaced with a specified value (`replace_unk`).

        The function also ensures that only learners with a number of skills less than or equal to a specified
        maximum (`max_cv_skills`) are considered. The numpy array is populated with each learner's skill proficiency levels,
        with rows corresponding to learners and columns corresponding to skills.

        Parameters:
        config (dict): A configuration dictionary containing the following keys:
            - "cv_path" (str): Path to the JSON file containing the learners' skill data.
            - "max_cv_skills" (int): The maximum number of skills a learner can have. Learners with more skills are skipped.
        skills (set): A set of skill identifiers (usually integer indices).
        skills2int (dict): A dictionary mapping skill identifiers to integer indices.
        mastery_levels (dict): A dictionary mapping mastery level strings (e.g., 'beginner', 'expert') to integer values.
        replace_unk (int, optional): The value used to replace unknown mastery levels. Defaults to 1.

        Returns:
        numpy.ndarray: A numpy array of shape (num_learners, num_skills) where each element represents
                        the proficiency level of a learner for a particular skill.
                        Learners with more skills than `max_cv_skills` are excluded.
        """
        learners = json.load(open(os.path.join("../../", config["cv_path"])))
        #display(learners)  #output:'36856210': [[1024, 'expert'],[2017, 'expert'],...],
        #print(len(learners)) #output: 238 (total learners)
        max_learner_skills = config["max_cv_skills"]
        learners_index = dict()

        # numpy array to store the learners skill proficiency levels with default value 0
        # learners_array correspond to self.learners in POO
        learners_array = np.zeros((len(learners), len(skills)), dtype=int)
        index = 0

        # fill the numpy array with the learners skill proficiency levels from the json file
        for learner_id, learner in learners.items():
            #eg. learner_id = 36856210 , learner = [[1024, 'expert'],[2017, 'expert'],...]
            ## Note that: the number of skills of learner after apply this function is < or = the original
            avg_learner = get_avg_skills(learner,skills2int,mastery_levels, replace_unk)

            # if the number of skills is greater than the max_learner_skills, we skip the learner
            if len(avg_learner) > max_learner_skills:
                continue #donc index isnt counted

            # we fill the numpy array with the averaged mastery levels
            for skill, level in avg_learner.items():
                #e.g skill = 4 (taxo skill lv3) , level = 2 (avg mastery level)
                learners_array[index][skill] = level

            learners_index[index] = learner_id
            #learners_index[learner_id] = index #??????? no interest, double the number of learners

            index += 1
        #print(len(learners_index))
        print(learners_index)
        print("\nlearners_array before update")
        print(f"shape:({learners_array.shape[0]} learners, {learners_array.shape[1]} skills)")
        print(learners_array)
    
        # we update the learners numpy array with the correct number of rows ( condition < or = 15 skills)
        learners_array = learners_array[:index]
        print("\nlearners_array after update")
        print(f"shape:({learners_array.shape[0]} learners, {learners_array.shape[1]} skills)")
        print(learners_array)
        return learners_array

In [20]:
learners_array = load_learners(config,skills,skills2int,mastery_levels)

{0: '10839851', 1: '39718499', 2: '321', 3: '322', 4: '324', 5: '325', 6: '326', 7: '3243', 8: '3285', 9: '3286', 10: '3289', 11: '32125', 12: '32128', 13: '32129', 14: '32130', 15: '32131', 16: '32133', 17: '32134', 18: '32135', 19: '32137', 20: '32138', 21: '32209', 22: '32213', 23: '32214', 24: '32240', 25: '32242', 26: '32267', 27: '32328', 28: '32329', 29: '32332', 30: '32376', 31: '32380', 32: '32430', 33: '32433', 34: '32489', 35: '32493', 36: '32519', 37: '32523', 38: '32564', 39: '32600', 40: '32601', 41: '32603', 42: '32604', 43: '32606', 44: '32628', 45: '32629', 46: '32631', 47: '32632', 48: '32668', 49: '32669', 50: '32673', 51: '32674'}

learners_array before update
shape:(238 learners, 46 skills)
[[2 2 0 ... 0 0 0]
 [0 2 0 ... 0 0 3]
 [1 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]

learners_array after update
shape:(52 learners, 46 skills)
[[2 2 0 ... 0 0 0]
 [0 2 0 ... 0 0 3]
 [1 0 0 ... 0 0 0]
 ...
 [1 1 0 ... 0 0 0]
 [2 2 0 ... 0 0 0]

In [12]:
learners = json.load(open(os.path.join("../../", config["cv_path"])))
print("nb of skills type 4 of learner 325: ", len(learners["325"]))
print(learners["325"], "\n") 
      
learner_ids = ["10839851", "325"] #check the learner with id 0 and id 5 in learners_index
all_avg_skills = {}

for learner_id in learner_ids:
    avg_skills = get_avg_skills(learners[learner_id], skills2int, mastery_levels, replace_unk=1)
    print(f"nb of skills type 3 of learner {learner_id}: {len(avg_skills)}\n")
    all_avg_skills[learner_id] = avg_skills

print(all_avg_skills)

nb of skills type 4 of learner 325:  11
[[1029, 'beginner'], [2422, 'intermediate'], [2348, 'intermediate'], [2142, 'intermediate'], [1072, 'intermediate'], [1414, 'intermediate'], [2441, 'intermediate'], [1065, 'unknown'], [2721, 'beginner'], [2516, 'beginner'], [1727, 'beginner']] 

nb of skills type 3 of learner 10839851: 14

nb of skills type 3 of learner 325: 8

{'10839851': defaultdict(<class 'list'>, {4: 1, 36: 2, 0: 2, 8: 1, 33: 1, 19: 2, 5: 1, 9: 2, 14: 1, 1: 2, 28: 1, 37: 1, 26: 2, 27: 2}), '325': defaultdict(<class 'list'>, {0: 2, 7: 2, 24: 2, 19: 2, 14: 1, 25: 1, 1: 1, 18: 1})}


#### Confirm that `learners_array` reflects the correct mapping: rows correspond to learners, and the first column represents skill type 3 with id 1. There should be a total of 46 type 3 skills.