In [1]:
import json
import random
import os
import argparse

import pandas as pd
import numpy as np

from collections import defaultdict
import yaml
from IPython.display import display

In [2]:
print(os.listdir(".."))
print(os.getcwd())
print(os.path.exists("Code/config/run.yaml"))

['.gitignore', 'config', 'Experiment', 'jcrec', 'requirements.txt', 'results']
C:\Users\minhm\OneDrive - Université Côte d'Azur\I3S_CNRS_Internship\Project\Code\Experiment
False


In [3]:
config_path = "../config/run.yaml"

with open(config_path, "r") as f:
        config = yaml.load(f, Loader=yaml.FullLoader)

In [4]:
path = os.path.join("../../", config["taxonomy_path"]) #test path
print(path)

../../Data - Collection/Final/taxonomy.csv


In [5]:
def load_skills(config):
        """
        Loads skills from a taxonomy file into the instance, processes them based on configuration,
        and creates a mapping of skills to integer indices.

        The method reads a CSV file specified in the configuration and processes the skills
        either by extracting unique values from the 'Type Level 3' column (if level_3 is True)
        or using the 'unique_id' column (if level_3 is False). It populates `skills` with
        a set of skills and `skills2int` with a dictionary mapping skills to integer indices.

        Attributes Modified:
            skills (set): A set of unique skill identifiers or level 3 types.
            skills2int (dict): A dictionary mapping skill identifiers to integer indices.
        """
        # load the skills from the taxonomy file
        skills = pd.read_csv(os.path.join("../../", config["taxonomy_path"]))
        display(skills.head(3))
        print("\n")
        print(skills.columns)
        print(f"Total skills :{len(skills)}")
        # if level_3 is true, we only use the level 3 of the skill taxonomy, then we need to get the unique values in column Type Level 3
        ## Note: A single taxonomy skill may be shared across multiple skills. Using Level 3 taxonomy is preferred
        # as it maintains effective skill categorization. Levels 1 or 2 are too broad, resulting in overly general domains.
        if config["level_3"]:
            # get all the unique values in column Type Level 3
            level2int = {
                level: i for i, level in enumerate(skills["Type Level 3"].unique())
            }

            # make a dict from column unique_id to column Type Level 3
            skills_dict = dict(
                zip(skills["unique_id"], skills["Type Level 3"])
            )

            # map skills_dict values to level2int
            skills2int = {
                key: level2int[value] for key, value in skills_dict.items()
            }
            skills = set(skills2int.values())
            print(f"total taxonomy skills :",len(skills), "\n")
            #print(level2int) #output : software and applications development and analysis : 0
            #print(skills_dict) #output : 1000: software and applications development and analysis
            #print(skills2int) #output : 1000: 0
            return skills,skills2int
        # if level_3 is false, we use the unique_id column as the skills ~ no taxonomy levels
        else:
            skills = set(skills["unique_id"])
            skills2int = {skill: i for i, skill in enumerate(skills)}
            #print(len(skills)) #output : 1794
            #print(len(skills2int)) #output : 1794
            return skills,skills2int

In [6]:
skills,skills2int = load_skills(config=config)

Unnamed: 0.1,Unnamed: 0,Source,Type Level 4,altLabels,Definition,Dimension,Type Level 1,Type Level 2,Type Level 3,unique_id,name,name+definition
0,0,http://data.europa.eu/esco/skill/000f1d3d-220f...,Haskell,Haskell,The techniques and principles of software deve...,knowledge,information and communication technologies (icts),information and communication technologies (icts),software and applications development and anal...,1000,Haskell,Haskell : the techniques and principles of sof...
1,2,http://data.europa.eu/esco/skill/0037c821-2898...,develop energy saving concepts,developing concepts for energy saving\r\ndevel...,Use current research results and collaborate w...,skills,management skills,developing objectives and strategies,developing operational policies and procedures,1001,develop energy saving concepts,develop energy saving concepts : use current r...
2,4,http://data.europa.eu/esco/skill/0058526a-11e9...,conduct research on flora,carry out research on flora\r\nflora research\...,Collect and analyse data about plants in order...,skills,information skills,analysing and evaluating information and data,analysing scientific and medical data,1002,conduct research on flora,conduct research on flora : collect and analys...




Index(['Unnamed: 0', 'Source', 'Type Level 4', 'altLabels', 'Definition',
       'Dimension', 'Type Level 1', 'Type Level 2', 'Type Level 3',
       'unique_id', 'name', 'name+definition'],
      dtype='object')
Total skills :1794
total taxonomy skills : 46 



In [7]:
def load_mastery_levels(config):
        """Load the mastery levels from the file specified in the config and store it in the class attribute"""
        mastery_levels = json.load(open(os.path.join("../../", config["mastery_levels_path"])))
        print(mastery_levels)
        return mastery_levels

In [8]:
mastery_levels = load_mastery_levels(config)

{'beginner': 1, 'intermediate': 2, 'expert': 3, 'unknown': -1}


### Load courses, jobs, learners functions

The functions described below compute matrices of individuals × skills, where each skill value is calculated as the average of mastery levels.

When working with taxonomy level 3 (skill type 3), a single type 3 skill can be shared by multiple type 4 skills. Averaging helps maintain consistency across mappings.

In [9]:
def get_avg_skills(skill_list,skills2int,mastery_levels, replace_unk):
        """
    Calculates the average mastery level for each skill in the provided skill list.

        This function processes a list of skills, each with a corresponding mastery level, 
        and computes the average mastery level for each skill. The mastery levels are 
        adjusted if they are in string format and match a predefined set of mastery levels. 
        If a mastery level is not recognized, it is replaced with the specified `replace_unk` value. 

        The function assumes that a single skill may have multiple mastery levels associated with it, 
        and it calculates the average for each skill. The result is a dictionary where the key is 
        the integer index of the skill, and the value is the rounded average mastery level for that skill.

    Parameters:
        skill_list (list of tuples): A list where each element is a tuple containing:
            - skill (str): The skill identifier.
            - mastery_level (str or int): The mastery level of the skill, which could be a string (e.g., 'beginner', 'expert') or an int.
        skills2int (dict): A dictionary mapping skill identifiers/ ID (str) to integer indices. (e.g., 1000: 0)
        mastery_levels (dict): A dictionary mapping mastery level strings to integer values. (e.g., {'beginner': 1, 'intermediate': 2})
        replace_unk (int): The value to replace any unknown mastery levels with (default is -1).

    Returns:
        dict: A dictionary where the keys are the integer taxonomy level 3 skill indices (from `skills2int`), 
              and the values are the rounded average mastery levels for each skill.
        """
        avg_skills = defaultdict(list)
        for skill, mastery_level in skill_list:
            #eg. skill = 1024 , mastery_level = 'beginner'
            # if the mastery level is a string and is in the mastery levels, we replace it with the corresponding value, otherwise we do nothing and continue to the next skill
            if isinstance(mastery_level, str) and mastery_level in mastery_levels:
                mastery_level = mastery_levels[mastery_level] #mapping to an integer
                if mastery_level == -1:
                    mastery_level = replace_unk
                # mapping to an integer which is the id of taxonomy level
                # Mapping skills type 4 of learners to type 3, 
                # so the number of skills may be less than or equal to the original number of skills
                skill = skills2int[skill] 
                avg_skills[skill].append(mastery_level)
        #print(avg_skills) #output :e.g  {4: [3, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 1],...}
        # we take the average of the mastery levels for each skill because on our dataset we can have multiple mastery levels for the same skill (taxonomy)
        for skill in avg_skills.keys():
            avg_skills[skill] = sum(avg_skills[skill]) / len(avg_skills[skill])
            avg_skills[skill] = round(avg_skills[skill])

        return avg_skills #output : e.g {4: 2, 28: 2,...}

In [10]:
def load_learners(config, skills, skills2int,mastery_levels,replace_unk=1): 
        """
        Loads learners' skill proficiency data from a specified file and stores it in a numpy array,
        where each learner's skills are represented by their proficiency levels.

        This function reads the learner data from a JSON file, processes their skill proficiency levels,
        and stores them in a numpy array. The proficiency levels are adjusted based on the mastery levels
        provided, and unknown mastery levels are replaced with a specified value (`replace_unk`).

        The function also ensures that only learners with a number of skills less than or equal to a specified
        maximum (`max_cv_skills`) are considered. The numpy array is populated with each learner's skill proficiency levels,
        with rows corresponding to learners and columns corresponding to skills.

        Parameters:
        config (dict): A configuration dictionary containing the following keys:
            - "cv_path" (str): Path to the JSON file containing the learners' skill data.
            - "max_cv_skills" (int): The maximum number of skills a learner can have. Learners with more skills are skipped.
        skills (set): A set of skill identifiers (usually integer indices).
        skills2int (dict): A dictionary mapping skill identifiers to integer indices.
        mastery_levels (dict): A dictionary mapping mastery level strings (e.g., 'beginner', 'expert') to integer values.
        replace_unk (int, optional): The value used to replace unknown mastery levels. Defaults to 1. (consider that learner is beginner)

        Returns:
        numpy.ndarray: A numpy array of shape (num_learners, num_skills) where each element represents
                        the proficiency level of a learner for a particular skill.
                        Learners with more skills than `max_cv_skills` are excluded.
        """
        learners = json.load(open(os.path.join("../../", config["cv_path"])))
        #display(learners)  #output:'36856210': [[1024, 'expert'],[2017, 'expert'],...],
        #print(len(learners)) #output: 238 (total learners)
        max_learner_skills = config["max_cv_skills"]
        learners_index = dict()

        # numpy array to store the learners skill proficiency levels with default value 0
        # learners_array correspond to self.learners in POO
        learners_array = np.zeros((len(learners), len(skills)), dtype=int)
        index = 0

        # fill the numpy array with the learners skill proficiency levels from the json file
        for learner_id, learner in learners.items():
            #eg. learner_id = 36856210 , learner = [[1024, 'expert'],[2017, 'expert'],...]
            ## Note that: the number of skills of learner after apply this function is < or = the original
            avg_learner = get_avg_skills(learner,skills2int,mastery_levels, replace_unk)

            # if the number of skills is greater than the max_learner_skills, we skip the learner
            if len(avg_learner) > max_learner_skills:
                continue #donc index isnt counted

            # we fill the numpy array with the averaged mastery levels
            for skill, level in avg_learner.items():
                #e.g skill = 4 (taxo skill lv3) , level = 2 (avg mastery level)
                learners_array[index][skill] = level

            learners_index[index] = learner_id
            learners_index[learner_id] = index #??????? no interest, double the number of learners

            index += 1
        #print(len(learners_index))
        print(learners_index)
        print("\nlearners_array before update")
        print(f"shape:({learners_array.shape[0]} learners, {learners_array.shape[1]} skills)")
        print(learners_array)
    
        # we update the learners numpy array with the correct number of rows ( condition < or = 15 skills)
        learners_array = learners_array[:index]
        print("\nlearners_array after update")
        print(f"shape:({learners_array.shape[0]} learners, {learners_array.shape[1]} skills)")
        print(learners_array)
        return learners_array, learners_index

In [11]:
learners_array, learners_index = load_learners(config,skills,skills2int,mastery_levels)

{0: '10839851', '10839851': 0, 1: '39718499', '39718499': 1, 2: '321', '321': 2, 3: '322', '322': 3, 4: '324', '324': 4, 5: '325', '325': 5, 6: '326', '326': 6, 7: '3243', '3243': 7, 8: '3285', '3285': 8, 9: '3286', '3286': 9, 10: '3289', '3289': 10, 11: '32125', '32125': 11, 12: '32128', '32128': 12, 13: '32129', '32129': 13, 14: '32130', '32130': 14, 15: '32131', '32131': 15, 16: '32133', '32133': 16, 17: '32134', '32134': 17, 18: '32135', '32135': 18, 19: '32137', '32137': 19, 20: '32138', '32138': 20, 21: '32209', '32209': 21, 22: '32213', '32213': 22, 23: '32214', '32214': 23, 24: '32240', '32240': 24, 25: '32242', '32242': 25, 26: '32267', '32267': 26, 27: '32328', '32328': 27, 28: '32329', '32329': 28, 29: '32332', '32332': 29, 30: '32376', '32376': 30, 31: '32380', '32380': 31, 32: '32430', '32430': 32, 33: '32433', '32433': 33, 34: '32489', '32489': 34, 35: '32493', '32493': 35, 36: '32519', '32519': 36, 37: '32523', '32523': 37, 38: '32564', '32564': 38, 39: '32600', '32600':

In [12]:
learners = json.load(open(os.path.join("../../", config["cv_path"])))
print("nb of skills type 4 of learner 325: ", len(learners["325"]))
print(learners["325"], "\n") 
      
learner_ids = ["10839851", "325"] #check the learner with id 0 and id 5 in learners_index
all_avg_skills = {}

for learner_id in learner_ids:
    avg_skills = get_avg_skills(learners[learner_id], skills2int, mastery_levels, replace_unk=1)
    print(f"nb of skills type 3 of learner {learner_id}: {len(avg_skills)}\n")
    all_avg_skills[learner_id] = avg_skills

print(avg_skills)
print(all_avg_skills)

nb of skills type 4 of learner 325:  11
[[1029, 'beginner'], [2422, 'intermediate'], [2348, 'intermediate'], [2142, 'intermediate'], [1072, 'intermediate'], [1414, 'intermediate'], [2441, 'intermediate'], [1065, 'unknown'], [2721, 'beginner'], [2516, 'beginner'], [1727, 'beginner']] 

nb of skills type 3 of learner 10839851: 14

nb of skills type 3 of learner 325: 8

defaultdict(<class 'list'>, {0: 2, 7: 2, 24: 2, 19: 2, 14: 1, 25: 1, 1: 1, 18: 1})
{'10839851': defaultdict(<class 'list'>, {4: 1, 36: 2, 0: 2, 8: 1, 33: 1, 19: 2, 5: 1, 9: 2, 14: 1, 1: 2, 28: 1, 37: 1, 26: 2, 27: 2}), '325': defaultdict(<class 'list'>, {0: 2, 7: 2, 24: 2, 19: 2, 14: 1, 25: 1, 1: 1, 18: 1})}


#### Confirm that `learners_array` reflects the correct mapping: rows correspond to learners, and the first column represents skill type 3 with id 1. There should be a total of 46 type 3 skills.

In [13]:
def load_jobs(config, skills,skills2int, mastery_levels ,replace_unk=3):
        """Load the jobs from the file specified in the config and store it in the class attribute

        Args:
            replace_unk (int, optional): The value to replace the unknown mastery levels. Defaults to 3. (consider that the required skill level is expert)
        """
        jobs = json.load(open(os.path.join("../../", config["job_path"])))
        #print(len(jobs)) total jobs = 3587
        #print(jobs)   #output : {'15': [[1142, 'unknown'], [1094, 'unknown'],...],'4976': [[1032, 'expert'],...]
        
        # numpy array to store the job skill proficiency levels with default value 0
        # jobs_array correspond to self.jobs in POO
        jobs_array = np.zeros((len(jobs), len(skills)), dtype=int)
        jobs_index = dict()
        index = 0

        
        for job_id, job in jobs.items():
            jobs_index[index] = job_id
            jobs_index[job_id] = index
            avg_job = get_avg_skills(job, skills2int, mastery_levels ,replace_unk)

            for skill, level in avg_job.items():
                jobs_array[index][skill] = level
            index += 1
        print(jobs_array)
        print(f"\nshape:({jobs_array.shape[0]} jobs, {jobs_array.shape[1]} skills)")
        #print(jobs_index) #output : {0: '15', '15': 0, 1: '16', '16': 1,...}

        return jobs_array, jobs_index

In [14]:
jobs_array,jobs_index = load_jobs(config, skills,skills2int, mastery_levels ,replace_unk=3)

[[2 0 0 ... 0 0 0]
 [2 3 0 ... 0 3 0]
 [3 2 0 ... 0 0 0]
 ...
 [3 0 0 ... 0 0 0]
 [3 3 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]

shape:(3587 jobs, 46 skills)


In [15]:
def load_courses(config,skills,skills2int, mastery_levels, replace_unk=2):
        """
    Loads course-related data and encodes it into a structured array format.

    The function parses a course dataset defined in the configuration. Each course may have:
    - Required skills ("required")
    - Skills to be acquired after the course ("to_acquire")

    Skills are mapped to integer IDs based on the taxonomy, and mastery levels are standardized 
    using a predefined mapping. If a level is unknown or unrecognized, a default value is used.

    The output is a 3D numpy array with:
        - Axis 0: individual courses
        - Axis 1: type of skill (0 = required, 1 = to acquire)
        - Axis 2: encoded skill taxonomy (based on skills2int)

    Args:
        config (dict): Configuration dictionary containing the course file path.
        skills (set): A set of skill identifiers used as reference for array dimensions.
        skills2int (dict): Mapping from skill unique IDs to taxonomy-level integer indices.
        mastery_levels (dict): Mapping of mastery level labels (e.g. 'beginner') to integers.
        replace_unk (int, optional): Default value used for unknown skill levels (default is 2).

    Returns:
        tuple: 
            - courses_array (np.ndarray): Structured array of shape (n_courses, 2, n_skills).
            - courses_index (dict): Mapping between course IDs and internal indices.
        """
        courses = json.load(open(os.path.join("../../", config["course_path"])))
        #print(courses) #ouput : {'11475': {'required': [[1692, 'beginner'],...],'to_acquire': [[1072, 'beginner'],..]}, ...}
        #print(len(courses))   #output : 3000

        # numpy array to store the courses skill proficiency levels with default value 0
        # courses_array correspond to self.courses in POO
        courses_array = np.zeros((len(courses), 2, len(skills)), dtype=int)
        courses_index = dict()
        index = 0
        for course_id, course in courses.items():
            # if the course does not provide any skills, we skip it
            if "to_acquire" not in course:
                continue

            courses_index[course_id] = index
            courses_index[index] = course_id

            avg_provided = get_avg_skills(course["to_acquire"],skills2int, mastery_levels, replace_unk)
            for skill, level in avg_provided.items():
                courses_array[index][1][skill] = level

            if "required" in course:
                avg_required = get_avg_skills(course["required"],skills2int, mastery_levels, replace_unk)
                for skill, level in avg_required.items():
                    courses_array[index][0][skill] = level

            index += 1
        # update the courses numpy array with the correct number of rows
        courses_array = courses_array[:index]
        #print(courses_array)
        print(f"\nshape: {courses_array.shape[0]} x {courses_array.shape[1]} x {courses_array.shape[2]} (courses x required/provided x skills)")


        return courses_array, courses_index

In [16]:
courses_array, courses_index=load_courses(config,skills,skills2int, mastery_levels, replace_unk=2)


shape: 3000 x 2 x 46 (courses x required/provided x skills)


eg : 
```python
courses_array = [
  [  # Course 0
    [1, 0, 0, ..., 0],   # index 0: required skills (dim :46)
    [0, 2, 0, ..., 0]    # index 1: provided skills
  ],
  [  # Course 1
    [0, 0, 0, ..., 0],
    [3, 0, 0, ..., 1]
  ],
  ...
]
```

In [17]:
def get_subsample(config, learners_array, learners_index, jobs_array, jobs_index, courses_array,courses_index):
        """
       Randomly subsample learners, jobs, and courses based on config settings.

    Uses fixed seed to ensure reproducibility. If the number of desired learners, jobs,
    or courses is specified (non -1), it selects a random subset of the corresponding arrays
    and reindexes the index mappings accordingly.

    Args:
        config (dict): Configuration with subsample sizes and random seed.
        learners_array (np.ndarray): Matrix of learners × skills.
        learners_index (dict): Index mapping between array row and learner ID.
        jobs_array (np.ndarray): Matrix of jobs × skills.
        jobs_index (dict): Index mapping between array row and job ID.
        courses_array (np.ndarray): 3D array [course][required/provided][skill].
        courses_index (dict): Index mapping between array row and course ID.

    Returns:
        tuple: Subsampled arrays and updated index mappings.
        """
        random.seed(config["seed"])
        if config["nb_cvs"] != -1:
            # get a random sample of self.config["nb_cvs"] of ids from 0 to len(self.learners)
            learners_ids = random.sample(
                range(len(learners_array)), config["nb_cvs"]
            )
            # update the learners numpy array and the learners_index dictionary with the sampled ids
            learners_array = learners_array[learners_ids]
            learners_index = {
                i: learners_index[index] for i, index in enumerate(learners_ids)
            }
            learners_index.update({v: k for k, v in learners_index.items()})
        if config["nb_jobs"] != -1:
            jobs_ids = random.sample(range(len(jobs_array)), config["nb_jobs"])
            jobs_array = jobs_array[jobs_ids]
            jobs_index = {
                i: jobs_index[index] for i, index in enumerate(jobs_ids)
            }
            jobs_index.update({v: k for k, v in jobs_index.items()})
        if config["nb_courses"] != -1:
            courses_ids = random.sample(
                range(len(courses_array)), config["nb_courses"]
            )
            courses_array = courses_array[courses_ids]
            courses_index = {
                i: courses_index[index] for i, index in enumerate(courses_ids)
            }
            courses_index.update({v: k for k, v in courses_index.items()})

        return learners_array, learners_index, jobs_array, jobs_index, courses_array,courses_index

In [18]:
learners_array, learners_index, jobs_array, jobs_index, courses_array,courses_index = get_subsample(config, learners_array, learners_index, jobs_array, jobs_index, courses_array,courses_index)
print("=== Learners ===")
print("Learners array shape:", learners_array.shape)  # (num_learners, num_skills)
print("Number of learners in index:", len(learners_index) // 2)  # divise by 2 because id -> index và index -> id (structure)

print("\n=== Jobs ===")
print("Jobs array shape:", jobs_array.shape)  # (num_jobs, num_skills)
print("Number of jobs in index:", len(jobs_index) // 2)

print("\n=== Courses ===")
print("Courses array shape:", courses_array.shape)  # (num_courses, 2, num_skills)
print("Number of courses in index:", len(courses_index) // 2)


=== Learners ===
Learners array shape: (52, 46)
Number of learners in index: 52

=== Jobs ===
Jobs array shape: (100, 46)
Number of jobs in index: 100

=== Courses ===
Courses array shape: (100, 2, 46)
Number of courses in index: 100


In [19]:
def make_course_consistent(skills,courses_array):
        """
    Adjust course consistency by ensuring a course doesn't require a skill
    at the same or lower level than it provides.

    For each course:
        - If a skill is both required and provided:
            - And provided_level <= required_level:
                - Reduce the required level accordingly (or remove it if too low).

    Args:
        skills (list): List of all skill identifiers.
        courses_array (np.ndarray): Array of shape [num_courses, 2, num_skills],
            where index 0 = required skills, index 1 = provided skills.
        """
        for course in courses_array: 
            for skill_id in range(len(skills)):  # Loop through each skill in the course
                required_level = course[0][skill_id]  # Get the required level for the skill
                provided_level = course[1][skill_id]  # Get the provided level for the skill
        
                if provided_level != 0 and provided_level <= required_level:  
                   if provided_level == 1:  #the most basic level
                      course[0][skill_id] = 0  # Set the required level to 0, meaning this skill is no longer required
                   else:  # >1
                      course[0][skill_id] = provided_level - 1  # Decrease the required level by 1 to avoid conflict with the provided level


In [20]:
# # Print the first course before making it consistent
# print("Initial course (required vs provided):")
# course = courses_array[8]  #first course 
# print(f"Course 0:")
# print(f"  Required: {course[0]}")
# print(f"  Provided: {course[1]}")

# # Make the course consistent
# make_course_consistent(skills, courses_array)

# # Print the first course after making it consistent
# print("\nUpdated course (required vs provided):")
# course = courses_array[8]  #first course
# print(f"Course 0:")
# print(f"  Required: {course[0]}")
# print(f"  Provided: {course[1]}")

In [21]:
def get_jobs_inverted_index(jobs_array):
    """
    Create an inverted index for jobs based on the required skill levels.

    The function generates a mapping where each skill (represented by its index) 
    is associated with a set of job indices that require this skill. The level 
    of the skill is considered, so only skills with a level greater than 0 
    are included in the inverted index.

    Args:
        jobs_array (np.ndarray): A 2D array where each row represents a job 
                                  and each column represents a skill. The 
                                  value at each position (i, j) represents 
                                  the skill level for job i and skill j. 

    Returns:
        dict: A dictionary (`jobs_inverted_index`) where each key is a skill 
              (represented by the skill index) and each value is a set of job 
              indices that require this skill.
              
    Example:
        jobs_array = [
            [1, 0, 0],  # Job 0 requires skill 0 at level 1
            [0, 2, 0],  # Job 1 requires skill 1 at level 2
            [0, 0, 3]   # Job 2 requires skill 2 at level 3
        ]
        
        get_jobs_inverted_index(jobs_array)
        # Returns: 
        # {0: {0}, 1: {1}, 2: {2}} 
        # (Skill 0 is required by job 0, Skill 1 is required by job 1, and Skill 2 is required by job 2)
    """
    jobs_inverted_index = defaultdict(set)
    for i, job in enumerate(jobs_array):
        for skill, level in enumerate(job):
            if level > 0:  # Only consider skills with a level greater than 0
                jobs_inverted_index[skill].add(i)
    
    return jobs_inverted_index

In [22]:
jobs_inverted_index = get_jobs_inverted_index(jobs_array)

In [23]:
import sys
sys.path.append(os.path.abspath('../jcrec')) 
from matchings import*

In [24]:
def get_nb_applicable_jobs(learner, jobs_array,jobs_inverted_index, threshold):
    """Get the number of applicable jobs for a learner

    Args:
        learner (list): A list of skills and mastery levels for the learner. 
                        The length of the list corresponds to the number of skills, 
                        and the values represent the mastery level for each skill.
        threshold (float): The threshold value for the matching. 
                           Only jobs that meet or exceed this threshold will be considered applicable.

    Returns:
        int: The total number of jobs that match the learner's skills at or above the given threshold.
    """
    nb_applicable_jobs = 0  # Initialize the count of applicable job
    jobs_subset = set()  # Initialize a set to store jobs that match the learner's skills

    # Get the indices of the skills where the learner has a non-zero mastery level (knowledge base of learner)
    skills = np.nonzero(learner)[0] 
    #output eg. first learner : array([ 0,  1,  4,  5,  8,  9, 14, 19, 26, 27, 28, 33, 36, 37], dtype=int64)

    # This loop collects all distinct jobs that require at least one of the learner's skills 
    for skill in skills:
        if skill in jobs_inverted_index:  # Check if the skill is required by any jobs
           jobs_subset.update(jobs_inverted_index[skill])  # Add the elements (jobs) that require this skill to the jobs_subset

    # Then, match the learner against each job; if the matching score exceeds the threshold, count it as an applicable job
    for job_id in jobs_subset:
        matching = learner_job_matching(learner, jobs_array[job_id])  # Calculate the matching score for the job
        if matching >= threshold:  # If the matching score is above or equal to the threshold, the job is applicable
           nb_applicable_jobs += 1  # Increment the count of applicable jobs

    return nb_applicable_jobs

In [25]:
for i in range(10):  # nb applicable jobs for the first 10 learners
    applicable_jobs_low = get_nb_applicable_jobs(learners_array[i], jobs_array, jobs_inverted_index, 0.1)
    print(f"\nLearner {i} - Applicable jobs (threshold 0.1): {applicable_jobs_low}")

    applicable_jobs_high = get_nb_applicable_jobs(learners_array[i], jobs_array, jobs_inverted_index, 0.8)
    print(f"Learner {i} - Applicable jobs (threshold 0.8): {applicable_jobs_high}")


Learner 0 - Applicable jobs (threshold 0.1): 100
Learner 0 - Applicable jobs (threshold 0.8): 0

Learner 1 - Applicable jobs (threshold 0.1): 95
Learner 1 - Applicable jobs (threshold 0.8): 0

Learner 2 - Applicable jobs (threshold 0.1): 96
Learner 2 - Applicable jobs (threshold 0.8): 0

Learner 3 - Applicable jobs (threshold 0.1): 100
Learner 3 - Applicable jobs (threshold 0.8): 0

Learner 4 - Applicable jobs (threshold 0.1): 5
Learner 4 - Applicable jobs (threshold 0.8): 0

Learner 5 - Applicable jobs (threshold 0.1): 91
Learner 5 - Applicable jobs (threshold 0.8): 0

Learner 6 - Applicable jobs (threshold 0.1): 100
Learner 6 - Applicable jobs (threshold 0.8): 1

Learner 7 - Applicable jobs (threshold 0.1): 26
Learner 7 - Applicable jobs (threshold 0.8): 0

Learner 8 - Applicable jobs (threshold 0.1): 47
Learner 8 - Applicable jobs (threshold 0.8): 0

Learner 9 - Applicable jobs (threshold 0.1): 100
Learner 9 - Applicable jobs (threshold 0.8): 0


In [26]:
matching(learners_array[0],jobs_array[0]) # not matched

0.16666666666666666

In [27]:
np.minimum(learners_array[0],jobs_array[0])

array([2, 2, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0])

In [28]:
print(learners_array)
print(jobs_array)

[[2 2 0 ... 0 0 0]
 [0 2 0 ... 0 0 3]
 [1 0 0 ... 0 0 0]
 ...
 [1 1 0 ... 0 0 0]
 [2 2 0 ... 0 0 0]
 [1 2 0 ... 0 0 0]]
[[3 3 0 ... 0 0 0]
 [2 3 0 ... 0 0 0]
 [3 3 3 ... 0 0 0]
 ...
 [2 3 0 ... 0 3 0]
 [2 3 3 ... 0 0 0]
 [2 3 2 ... 0 0 0]]


In [29]:
def get_avg_applicable_jobs(learners_array,threshold):
        """Get the average number of applicable jobs for all the learners

        Args:
            threshold (float): the threshold for the matching

        Returns:
            float: the average number of applicable jobs
        """
        avg_applicable_jobs = 0
        for learner in learners_array:
            avg_applicable_jobs += get_nb_applicable_jobs(learner, jobs_array, jobs_inverted_index, threshold)
        avg_applicable_jobs /= len(learners_array)
        return avg_applicable_jobs

In [30]:
avg_applicable_jobs = get_avg_applicable_jobs(learners_array,0.8)
print(avg_applicable_jobs)

0.09615384615384616


In [31]:
def get_all_enrollable_courses(learner, courses_array, threshold):
        """Get all the enrollable courses for a learner

        Args:
            learner (list): list of skills and mastery level of the learner
            threshold (float): the threshold for the matching

        Returns:
            dict: dictionary of enrollable courses
        """
        enrollable_courses = {}
        for i, course in enumerate(courses_array):
            required_matching = learner_course_required_matching(
                learner, course
            )
            provided_matching = learner_course_provided_matching(
                learner, course
            )
            if required_matching >= threshold and provided_matching < 1.0:
                enrollable_courses[i] = course
        return enrollable_courses

In [32]:
enrollable_courses = get_all_enrollable_courses(learners_array[0], courses_array, threshold = 0.8)
print(enrollable_courses) #ouput : course_id : array 2d ( required skills,  provided skills)
print(len(enrollable_courses)) #32

{0: array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0],
       [0, 0, 0, 2, 0, 0, 0, 0, 2, 0, 2, 2, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0,
        0, 0]]), 6: array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0]]), 13: array([[2, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0],
       [2, 0, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0,
        0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0]]), 14: a

In [33]:
learner_course_required_matching(learners_array[0],courses_array[0])

1.0

#### Let's check if the return of 1 is due to the course having no required skills, or if it's because the course is relevant to the learner, where matching = 1.

In [38]:
np.any(courses_array[0][0]) #true if at least one element is not 0
#so,  return of 1 is NOT due to the course having no required skills

True

In [41]:
print(learners_array[0]) #learner's skill
print(courses_array[0][0]) #course's required skill
print(courses_array[0][1]) #course's required skill

[2 2 0 0 1 1 0 0 1 2 0 0 0 0 1 0 0 0 0 2 0 0 0 0 0 0 2 2 1 0 0 0 0 1 0 0 2
 1 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0]
[0 0 0 2 0 0 0 0 2 0 2 2 0 0 0 0 0 0 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 2 0 0 0 0 0 0]


In [37]:
matching(learners_array[0],courses_array[0][0]) # totally eligible

1.0

In [49]:
learner_course_provided_matching(learners_array[0],courses_array[0]) #learner does not know everything provided after attending the course.

0.21428571428571427


$\begin{aligned}
uc\text{-}rel(u,c)
  &=\; uc_r(u,c_r)\,\bigl(1 - uc_p(u,c_p)\bigr)
  \\[1ex]
uc_r(u,c_r)
  &=\; \dfrac{1}{\lvert c_r\rvert}
       \sum_{s\in c_r}
         \dfrac{\mathrm{sim}\bigl(sl_{s,u},\,sl_{s,c_r}\bigr)}{1}
  \\[1ex]
uc_p(u,c_p)
  &=\; \dfrac{1}{\lvert c_p\rvert}
       \sum_{s\in c_p}
         \dfrac{\mathrm{sim}\bigl(sl_{s,u},\,sl_{s,c_p}\bigr)}{1}
\end{aligned}$



In [52]:
learner_course_matching(learners_array[0],courses_array[0]) # pretty relevant course [0] - learner [0]

0.7857142857142857