In [17]:
from findhr.xai.counterfactual import dice_ml
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from collections import namedtuple
import ast
from findhr.preprocess.example_mappings import MatchBinary, MatchOrdinal, MatchFeatureSet, MatchFeatureInclusionSeparated
from findhr.preprocess.mapping import AttachMetadata, DetachMetadata, DerivedColumn
from sklearn.pipeline import Pipeline
from lightgbm import LGBMRanker


## Define Helper Classes and Functions

In [18]:
# Note that the Age_j column needs to be split into Age_j_min and Age_j_max to generate the counterfactual explanations through the genetic algorithm

In [19]:
import pathlib

from dataclasses import dataclass

@dataclass
class MacroVariables:
    PATH = pathlib.Path("./code/DiCE_findhr/data")

    SUFFIX_DATASET = '1'  # '1' for demonstration, '2' for practice

    FILENAME_CURRICULA = "curricula{SUFFIX_DATASET}.csv"
    FILENAME_JOB_OFFERS = "job_offers{SUFFIX_DATASET}.csv"
    FILENAME_ADS_FAIR = 'score{SUFFIX_DATASET}_fair.csv'
    FILENAME_ADS_UNFAIR = 'score{SUFFIX_DATASET}_unfair.csv'

    FILENAME_FITNESS_MATRIX_FAIR = "fitness_mat{SUFFIX_DATASET}_fair.csv"
    FILENAME_FITNESS_MATRIX_UNFAIR = "fitness_mat{SUFFIX_DATASET}_unfair.csv"

    FILEPATH_CURRICULA = PATH / FILENAME_CURRICULA.format(SUFFIX_DATASET=SUFFIX_DATASET)
    FILEPATH_JOB_OFFERS = PATH / FILENAME_JOB_OFFERS.format(SUFFIX_DATASET=SUFFIX_DATASET)
    FILEPATH_ADS_FAIR = PATH / FILENAME_ADS_FAIR.format(SUFFIX_DATASET=SUFFIX_DATASET)
    FILEPATH_ADS_UNFAIR = PATH / FILENAME_ADS_UNFAIR.format(SUFFIX_DATASET=SUFFIX_DATASET)
    FILEPATH_FITNESS_MATRIX_FAIR = PATH / FILENAME_FITNESS_MATRIX_FAIR.format(SUFFIX_DATASET=SUFFIX_DATASET)
    FILEPATH_FITNESS_MATRIX_UNFAIR = PATH / FILENAME_FITNESS_MATRIX_UNFAIR.format(SUFFIX_DATASET=SUFFIX_DATASET)

    TOP_K = 10

    FAIR_DATA = True


In [20]:
from findhr.preprocess.metadata import JSONMetadata

# Define the metadata for the JDS dataset
md_JDS = {
    'qId': JSONMetadata(schema={'type': 'number'}),
    'Occupation_j': JSONMetadata(schema={'type': 'string'}),
    'Education_j': JSONMetadata(schema={'enum': ['No education', 'Degree', 'Bachelor D.', 'Master D.', 'PhD', 'Any']},
                              attr_type='category'),
    'Age_j_min': JSONMetadata(schema={'type': 'number'}),
    'Age_j_max': JSONMetadata(schema={'type': 'number'}),
    'Gender_j': JSONMetadata(schema={'enum': ['Male', 'Female', 'Non-binary', 'Any']},
                             attr_type='category', attr_usage='sensitive'),
    'Contract_j': JSONMetadata(schema={'enum': ['Remote', 'Hybrid', 'In presence']}),
    'Nationality_j': JSONMetadata(schema={'type': 'string'}),
    'Competences_j': JSONMetadata(schema={'type': "array", 'items': {'type': 'string'}}),
    'Knowledge_j': JSONMetadata(schema={'type': "array", 'items': {'type': 'string'} }),
    'Languages_j': JSONMetadata(schema={'type': "array", 'items': {'type': 'string'}}),
    'Experience_j': JSONMetadata(schema={'type': 'number'}),
}

# Define the metadata for the CDS dataset
md_CDS = {
    'kId': JSONMetadata(schema={'type': 'integer'}),
    'Occupation_c': JSONMetadata(schema={'type': 'string'}),
    'Education_c': JSONMetadata(schema={'enum': ['No education', 'Degree', 'Bachelor D.', 'Master D.', 'PhD', 'Any']},
                              attr_type='category'),
    'Age_c': JSONMetadata(schema={'type': 'number'}),
    'Gender_c': JSONMetadata(schema={'enum': ['Male', 'Female', 'Non-binary']},
                             attr_type='category', attr_usage='sensitive'),
    'Contract_c': JSONMetadata(schema={'enum': ['Remote', 'Hybrid', 'In presence', 'Any']}, attr_type='category'),
    'Nationality_c': JSONMetadata(schema={'type': 'string'}),
    'Competences_c': JSONMetadata(schema={'type': "array", 'items': {'type': 'string'}}),
    'Knowledge_c': JSONMetadata(schema={'type': "array", 'items': {'type': 'string'}}),
    'Experience_c': JSONMetadata(schema={'type': 'number'}),
    'Languages_c': JSONMetadata(schema={'type': "array",'items': {'type': 'string'}}),
}

md_ADS = {
    'rank': JSONMetadata(schema={'type': 'number', 'attr_usage':'target'}),
    'score': JSONMetadata(schema={'type': 'number', 'attr_usage':'target'}),
}
md_CDS_JDS_ADS = {**md_CDS, **md_JDS, **md_ADS}


In [21]:
def rank2relevance(df, top_k, col_rank):
    return top_k + 1 - df[col_rank].values.ravel()

In [22]:
# Define hyperparameters for the data split
TEST_SIZE = 0.2

VAL_SIZE = 0.25 # 0.25 x 0.8 = 0.2

RANDOM_STATE = 42

## Data Loading and Splitting

In [33]:
def data_split(df_qId_kId):
    all_jobs = df_qId_kId['qId'].unique()
    train_jobs, test_jobs = train_test_split(all_jobs, test_size=TEST_SIZE, random_state=RANDOM_STATE, shuffle=False)
    train_jobs, val_jobs = train_test_split(train_jobs, test_size=VAL_SIZE, random_state=RANDOM_STATE, shuffle=False)

    # Build train, test and validation sets, ensuring they are sorted by qId, kId
    df_train = df_qId_kId[df_qId_kId['qId'].isin(train_jobs)].sort_values(["qId", "kId"])
    df_val = df_qId_kId[df_qId_kId['qId'].isin(val_jobs)].sort_values(["qId", "kId"])
    df_test = df_qId_kId[df_qId_kId['qId'].isin(test_jobs)].sort_values(["qId", "kId"])

    return df_train, df_val, df_test


def convert_cols_mod(x):
    if isinstance(x, int) or isinstance(x, float):
        return x
    elif isinstance(x, list):
        return tuple(x)
    else:
        try:
            x = ast.literal_eval(x)
        finally:
            if isinstance(x, list):
                return tuple(x)
            return x

def load_dataset(fair_data=True):
    # Read dataset
    df_JDS = pd.read_csv(MacroVariables.FILEPATH_JOB_OFFERS,  # converters for columns of lists of values
                         converters={c: convert_cols_mod for c in [
                             # "Age_j",
                             "Competences_j", "Knowledge_j", "Languages_j"]})
    # Split the Age_j column into Age_j_min and Age_j_max to use MatchFeatureInclusionSeparated
    df_JDS['Age_j_min'] = df_JDS['Age_j'].apply(lambda x: ast.literal_eval(x)[0])
    df_JDS['Age_j_max'] = df_JDS['Age_j'].apply(lambda x: ast.literal_eval(x)[1])
    df_JDS.drop(columns=['Age_j'], inplace=True)

    # Use only the first 20 rows for demonstration purposes
    # df_JDS = df_JDS.iloc[:20]
    df_CDS = pd.read_csv(MacroVariables.FILEPATH_CURRICULA,  # converters for columns of lists of values
                         converters={c: convert_cols_mod for c in [
                             "Age_c",
                             "Experience_c", "Competences_c", "Knowledge_c", "Languages_c"]})
    # df_CDS = df_CDS.iloc[:20]
    # df_JDS = df_JDS.iloc[:20]
    # df_CDS.drop(columns=['Age_c'], inplace=True)
    # df_JDS.drop(columns=['Age_j'], inplace=True)

    df_ADS_FAIR = pd.read_csv(MacroVariables.FILEPATH_ADS_FAIR)
    df_ADS_UNFAIR = pd.read_csv(MacroVariables.FILEPATH_ADS_UNFAIR)


    cols_dict_HUDD = define_cols_dict_HUDD()


    if fair_data:
        # Merge CDS and JDS through ADS in a single dataframe
        df_CDS_JDS = pd.merge(df_ADS_FAIR, df_JDS, on='qId')
    else:
        # Merge CDS and JDS through ADS in a single dataframe
        df_CDS_JDS = pd.merge(df_ADS_UNFAIR, df_JDS, on='qId')

    df_CDS_JDS = pd.merge(df_CDS, df_CDS_JDS, on='kId')
    df_CDS_JDS = df_CDS_JDS[cols_dict_HUDD['cols_id'] + [col for col in df_CDS_JDS if col not in cols_dict_HUDD['cols_id']+ cols_dict_HUDD['col_target']] +
                            cols_dict_HUDD['col_target']]
    df_CDS_JDS[cols_dict_HUDD['col_rank']] = np.minimum(df_CDS_JDS.groupby("qId")[cols_dict_HUDD['col_target']].rank('dense', ascending=False), MacroVariables.TOP_K + 1)

    # dict_multilabelbinarizer = {}
    # for col in cols_dict_HUDD['setlist_features']:
    #     df_CDS_JDS, mlb = convert_skills(df_CDS_JDS, col)
    #     dict_multilabelbinarizer[col] = mlb

    # TODO: Change the original data
    return df_CDS_JDS, cols_dict_HUDD# , dict_multilabelbinarizer

In [34]:
def define_cols_dict_HUDD():
    """
    Define the columns of the HUDD dataset

    Returns:
    ----------
    cols_dict_HUDD: dict
        Dictionary with the columns of the HUDD dataset
    """
    # Define subsets of columns
    cols_id = ['qId', 'kId']

    # Define the subset of columns of the HUDD dataset describing the candidate,
    # which are used in the preprocessing+prediction pipeline
    cols_c = ['Education_c',
              'Age_c',
              'Gender_c', 'Contract_c',
              'Nationality_c', 'Competences_c', 'Knowledge_c', 'Languages_c',
              'Experience_c']
    cols_j = ['Education_j',
              'Age_j',
              'Gender_j',  'Contract_j', 'Nationality_j', 'Competences_j',
              'Knowledge_j', 'Languages_j', 'Experience_j']
    cols_pred_preprocess = cols_c + cols_j
    cols_not_for_pred = ['Occupation_c', 'Occupation_j']
    cols_sensitive = ['Gender_c']
    col_target = ['score']
    col_rank = ['rank']

    # Define the subset of columns of the HUDD dataset for the counterfactual explanation
    outcome_name_col = 'lambda'  # 'pred_rank'
    continuous_features = [
        'Age_c',
        'Experience_c', 'Experience_j',
    'Age_j_min', 'Age_j_max']  # ['Age_c', 'Experience_c'],
    categorical_features = ['Education_c', 'Gender_c', 'Contract_c', 'Nationality_c',
                            # 'Competences_c', 'Knowledge_c', 'Languages_c',
                            'Education_j', 'Gender_j', 'Contract_j', 'Nationality_j',
                            # 'Competences_j', 'Knowledge_j', 'Languages_j',
                            # 'Age_j'
                            ]
    setlist_features = ['Competences_c', 'Knowledge_c', 'Languages_c',
                        'Competences_j', 'Knowledge_j', 'Languages_j',
                        # 'Age_j'
                        ]
    cols_pred = ['Education_c',
                 'Age_c',
                 'Gender_c',
       'Contract_c', 'Nationality_c', 'Competences_c', 'Knowledge_c',
       'Languages_c', 'Experience_c', 'Education_j',
                 'Age_j_min', 'Age_j_max', # 'Age_j',
       'Gender_j', 'Contract_j', 'Nationality_j', 'Competences_j',
       'Knowledge_j', 'Languages_j', 'Experience_j']

    # continuous_features + categorical_features
    return {'outcome_name_col': outcome_name_col, 'continuous_features': continuous_features,
            'categorical_features': categorical_features, "setlist_features": setlist_features,
            'cols_pred': cols_pred,
            'cols_id': cols_id, 'cols_sensitive': cols_sensitive, 'col_target': col_target, 'col_rank': col_rank,
            'cols_pred_preprocess': cols_pred_preprocess, 'cols_not_for_pred': cols_not_for_pred}

def define_cols_dict_FEDD():
    """
    Define the columns of the FEDD dataset

    Returns:
    ----------
    cols_dict_FEDD: dict
        Dictionary with the columns of the FEDD dataset
    """

    outcome_name_col = 'lambda'  # 'pred_rank'
    continuous_features = ['fitness_Languages', 'fitness_Competences',
                           'fitness_Knowledge']  # ['Age_c', 'Experience_c'],
    categorical_features = ['fitness_Contract', 'fitness_Nationality', 'fitness_Education', 'fitness_Experience',
                            'fitness_Age',
                            'fitness_Gender']
    cols_pred = continuous_features + categorical_features

    cols_id = ['qId', 'kId']  # ids
    cols_sensitive = ['Gender_c']  # sensitive attribute(s)
    col_target = 'score'  # target value for ranking
    col_rank = 'rank'  # rank value for ranking

    return {'outcome_name_col': outcome_name_col, 'continuous_features': continuous_features,
            'categorical_features': categorical_features, 'cols_pred': cols_pred,
            'cols_id': cols_id, 'cols_sensitive': cols_sensitive, 'col_target': col_target, 'col_rank': col_rank}

In [35]:
class SuperRankerPipeline:

    def __init__(self, pipeline, ranker, cols_dict_FEDD): # steps, *, memory=None, verbose=False):
        self.pipeline = pipeline
        self.ranker = ranker
        self.cols_dict_FEDD = cols_dict_FEDD

    def predict(self, X):
        # params_pipeline = {k: v for k, v in params.items() if k in self.pipeline.get_params().keys()}
        _intermediate = self.pipeline.transform(X)
        # print('_intermediate', _intermediate)
        # _intermediate.drop(columns=['qId', 'kId'], inplace=True)
        return self.ranker.predict(_intermediate[self.cols_dict_FEDD['cols_pred']])

In [36]:
def extract_explicand_data_cf(job_id, exp_c_pred_rank, pipeline_fitness, df_CDS_JDS):
    # df_qId contains the data for the job qId

    # Isolate the candidates' profiles applying for the job qId
    df_qId_HUDD = df_CDS_JDS[df_CDS_JDS['qId'] == job_id]

    # Extract the explicand candidate kId
    exp_c_kId = df_qId_HUDD.loc[df_qId_HUDD['pred_rank'] == exp_c_pred_rank, 'kId'].iloc[0]

    # Isolate the explicand candidate profile
    exp_c_profile = df_CDS_JDS[df_CDS_JDS['kId'] == exp_c_kId]

    exp_c_fitness = pipeline_fitness.transform(exp_c_profile)

    exp_c = {'kId': exp_c_kId, 'profile': exp_c_profile, 'fitness': exp_c_fitness}

    return df_qId_HUDD, exp_c


def prepare_data_cf(df_qId_HUDD, cols_dict):
    # Convert data types
    df_qId_HUDD_pre = df_qId_HUDD[cols_dict['cols_pred']].copy(deep=True) #.astype('int').copy(deep=True)
    df_qId_HUDD_pre[cols_dict['cols_id']] = df_qId_HUDD[cols_dict['cols_id']]
    #df_qId_HUDD_pre[cols_dict['categorical_features']].copy(deep=True) #.astype('int').copy(deep=True)
    # df_qId_HUDD_pre[cols_dict['continuous_features']] = df_qId_HUDD[cols_dict['continuous_features']].astype(
    #    'float').copy(deep=True)
    df_qId_HUDD_pre[cols_dict['outcome_name_col']] = df_qId_HUDD[cols_dict['outcome_name_col']].copy(deep=True)
    feature_dtypes = None # {col: df_qId_HUDD_pre[col].dtype for col in df_qId_HUDD_pre[cols_dict['cols_pred']].columns}

    return df_qId_HUDD_pre, feature_dtypes


def define_target(args, df_qId_HUDD):
    # 'in_top_k' or 'out_top_k' depending on the candidate position
    explicand_class = 'in_top_k' if args.candidate_position <= MacroVariables.TOP_K else 'out_top_k'

    # target rank for counterfactual explanation
    if args.target_rank:
        tgt_cf_rank = args.target_rank
        tgt_cf_score = df_qId_HUDD[df_qId_HUDD['pred_rank'] == tgt_cf_rank]['score'].iloc[0]
        tgt_cf_candidate = df_qId_HUDD[df_qId_HUDD['pred_rank'] == tgt_cf_rank]

    elif args.target_score:
        tgt_cf_rank = None
        tgt_cf_score = args.target_score
        tgt_cf_candidate = None
    else:
        raise ValueError('Either target rank or target score must be provided')

    return explicand_class, tgt_cf_rank, tgt_cf_score, tgt_cf_candidate


## Build the preprocessing pipeline

In [37]:
def build_matching_functions():
    # Matching functions for pairs of job-candidate features
    maps_matching = {
         # MatchBinary: 1 = job value = candidate value OR job value is 'Any' OR candidate value is 'Any', 0 = otherwise
        # (('qId',), ('qId',)): IdentityMapping(),
        # (('kId',), ('kId',)): IdentityMapping(),
        # (('rank',), ('rank',)): IdentityMapping(),
        (('Contract_j', 'Contract_c'), ('fitness_Contract',)): MatchBinary(),
        (('Gender_j', 'Gender_c'), ('fitness_Gender',)): MatchBinary(),
        (('Nationality_j', 'Nationality_c'), ('fitness_Nationality',)): MatchBinary(),

         # MatchOrdinal: 1 = job value >= candidate OR job value is 'Any', 0 = otherwise
        (('Education_j', 'Education_c'), ('fitness_Education',)): MatchOrdinal(),
        (('Experience_j', 'Experience_c'), ('fitness_Experience',)): MatchOrdinal(),

         # MatchFeatureInclusion: 1 = candidate value in (job value(0,), >= job value(1,)) OR job value is 'Any', 0 = otherwise
        (('Age_j_min', 'Age_j_max', 'Age_c'), ('fitness_Age',)): MatchFeatureInclusionSeparated(),

         # MatchFeatureSet: 1 = fraction of job value that appear in candidate value
        (('Languages_j', 'Languages_c'), ('fitness_Languages',)): MatchFeatureSet(),
        (('Competences_j', 'Competences_c'), ('fitness_Competences',)): MatchFeatureSet(),
        (('Knowledge_j', 'Knowledge_c'), ('fitness_Knowledge',)): MatchFeatureSet()
    }
    return maps_matching


def build_fitness_matrix(df_CDS_JDS, cols_dict, fair_data=True):
    """
    Build the fitness matrix

    Parameters:
    ----------
    df_CDS_JDS: pd.DataFrame
        The dataset CDS_JDS
    cols_dict: dict
        Dictionary with the columns of the HUDD dataset
    fair_data: bool
        Whether the data is fair or unfair

    Returns:
    ----------
    pipeline_fitness: sklearn.pipeline.Pipeline
        The pipeline to calculate the fitness matrix
    df_fitness_mat: pd.DataFrame
        The fitness matrix
    """
    maps_matching = build_matching_functions()

    # Calculation as fit-transform preprocessing
    pipeline_fitness = Pipeline(steps=[
        ("init", AttachMetadata(md_CDS_JDS_ADS)),
        ("matching", DerivedColumn(maps_matching)),
        ("end", DetachMetadata())
    ])

    pipeline_fitness.fit(X=df_CDS_JDS)
    fitness_matrix = pipeline_fitness.transform(X=df_CDS_JDS)
    df_fitness_mat = fitness_matrix.copy(deep=True)
    columns_keep = cols_dict['cols_id'] + \
                   [col for col in fitness_matrix if
                    col.startswith('fitness_')] + cols_dict['cols_sensitive'] + cols_dict['col_target']

    df_fitness_mat = df_fitness_mat[columns_keep]

    # From scores, we can learn regressors; or we can produce ranks, and learn ranking models
    df_fitness_mat['rank'] = df_fitness_mat.groupby("qId")['score'].rank('dense', ascending=False)
    df_fitness_mat['rank'] = df_fitness_mat['rank'].apply(lambda x: x if x <= MacroVariables.TOP_K else MacroVariables.TOP_K + 1)

    return pipeline_fitness, df_fitness_mat


def build_pipeline_fitness(df_CDS_JDS):
    """
    Build the pipeline to calculate the fitness matrix

    Parameters:
    ----------
    df_CDS_JDS: pd.DataFrame
        The dataset CDS_JDS

    Returns:
    ----------
    pipeline_fitness: sklearn.pipeline.Pipeline
        The pipeline to calculate the fitness matrix
    fitness_matrix: pd.DataFrame
        The fitness matrix
    cols_dict_FEDD: dict
        Dictionary with the columns of the FEDD dataset
    """
    maps_matching = build_matching_functions()

    # Calculation as fit-transform preprocessing
    pipeline_fitness = Pipeline(steps=[
        ("init", AttachMetadata(md_CDS_JDS_ADS)),
        ("matching", DerivedColumn(maps_matching)),
        ("end", DetachMetadata())
    ])

    fitness_matrix = pipeline_fitness.fit_transform(X=df_CDS_JDS)


    cols_dict_FEDD = define_cols_dict_FEDD()
    return pipeline_fitness, fitness_matrix, cols_dict_FEDD




In [38]:
def transform_split(df_train_HUDD, df_val_HUDD, df_test_HUDD, pipeline_fitness, cols_dict_HUDD):
    """
    Transform the HUDD datasets into FEDD datasets by applying the fitness pipeline

    Parameters:
    df_train_HUDD: pd.DataFrame
        Training set of the HUDD dataset
    df_val_HUDD: pd.DataFrame
        Validation set of the HUDD dataset
    df_test_HUDD: pd.DataFrame
        Test set of the HUDD dataset
    pipeline_fitness: sklearn.pipeline.Pipeline
        Pipeline to calculate the fitness matrix
    cols_dict_HUDD: dict
        Dictionary with the columns of the HUDD dataset

    Returns:
    df_train_FEDD: pd.DataFrame
        Training set of the FEDD dataset
    df_val_FEDD: pd.DataFrame
        Validation set of the FEDD dataset
    df_test_FEDD: pd.DataFrame
        Test set of the FEDD dataset
    """
    df_train_FEDD = pipeline_fitness.transform(df_train_HUDD)
    df_train_FEDD.reset_index(drop=True, inplace=True)
    print('cols_dict_HUDD', cols_dict_HUDD)

    df_train_FEDD[cols_dict_HUDD['cols_id']] = df_train_HUDD[cols_dict_HUDD['cols_id']].values
    df_train_FEDD[cols_dict_HUDD['col_rank']] = df_train_HUDD[cols_dict_HUDD['col_rank']].values

    df_val_FEDD = pipeline_fitness.transform(df_val_HUDD)
    df_val_FEDD.reset_index(inplace=True)
    df_val_FEDD[cols_dict_HUDD['cols_id']] = df_val_HUDD[cols_dict_HUDD['cols_id']].values
    df_val_FEDD[cols_dict_HUDD['col_rank']] = df_val_HUDD[cols_dict_HUDD['col_rank']].values

    df_test_FEDD = pipeline_fitness.transform(df_test_HUDD)
    df_test_FEDD.reset_index(drop=True, inplace=True)
    df_test_FEDD[cols_dict_HUDD['cols_id']] = df_test_HUDD[cols_dict_HUDD['cols_id']].values
    df_test_FEDD[cols_dict_HUDD['col_rank']] = df_test_HUDD[cols_dict_HUDD['col_rank']].values

    return df_train_FEDD, df_val_FEDD, df_test_FEDD

In [39]:
def train(ranker, df_train, df_val, cols_dict):
    df_train_counts = df_train.groupby("qId")["qId"].count().to_numpy()
    df_val_counts = df_val.groupby("qId")["qId"].count().to_numpy()

    # Fitting ranker:
    ranker.fit(
        X=df_train[cols_dict['cols_pred']],
        # LightGBM relevance is the higher the better
        y=rank2relevance(df_train, MacroVariables.TOP_K, cols_dict['col_rank']),
        group = df_train_counts,
        eval_at = [MacroVariables.TOP_K],
        # LightGBM relevance is the higher the better
        eval_set =[(df_val[cols_dict['cols_pred']], rank2relevance(df_val, MacroVariables.TOP_K, cols_dict['col_rank']))],
        eval_group =[df_val_counts]
    )

    return ranker


def evaluate(ranker, df_eval, cols_dict):
    df_test_counts = df_eval.groupby("qId")["qId"].count().to_numpy()
    # Predicting ranker:
    df_eval['lambda'] = ranker.predict(df_eval[cols_dict['cols_pred']])
    df_eval['pred_rank'] = df_eval.groupby("qId")['lambda'].rank('dense', ascending=False)
    df_eval['pred_rank'] = df_eval['pred_rank'].apply(lambda x: x if x <= MacroVariables.TOP_K else MacroVariables.TOP_K + 1)

    return df_eval


def ranking_pipeline(df_train_FEDD, df_val_FEDD, df_test_FEDD, cols_dict_FEDD):
    pipeline_fitness.transform(df_train_HUDD)
    # Define the ranking model
    ranker = LGBMRanker(
        objective="lambdarank",
        class_weight="balanced",
        boosting_type="gbdt",
        importance_type="gain",
        learning_rate=0.1,
        n_estimators=100,
        force_row_wise=True,
        n_jobs=-1,  # max parallelism
        verbose=-1  # no verbosity
    )

    ranker = train(ranker, df_train_FEDD, df_val_FEDD, cols_dict_FEDD)
    df_train_FEDD = evaluate(ranker, df_train_FEDD, cols_dict_FEDD)
    df_val_FEDD = evaluate(ranker, df_val_FEDD, cols_dict_FEDD)
    df_test_FEDD = evaluate(ranker, df_test_FEDD, cols_dict_FEDD)

    return ranker, df_train_FEDD, df_val_FEDD, df_test_FEDD

def attach_predictions(df_CDS_JDS, ranker, pipeline_fitness, cols_dict_FEDD):
    df_CDS_JDS['lambda'] = ranker.predict(pipeline_fitness.transform(df_CDS_JDS)[cols_dict_FEDD['cols_pred']])
    df_CDS_JDS['pred_rank'] = df_CDS_JDS.groupby("qId")['lambda'].rank('dense', ascending=False)
    return df_CDS_JDS

In [40]:
def define_explainer_HUDD(pipeline_fitness, ranker, df_qId_HUDD_pre, cols_dict_HUDD, cols_dict_FEDD, feature_dtypes, explanation_method):

    super_pipeline_model = SuperRankerPipeline(pipeline_fitness, ranker, cols_dict_FEDD)

    data_dice = dice_ml.Data(dataframe=df_qId_HUDD_pre[cols_dict_HUDD['cols_pred'] + [cols_dict_HUDD['outcome_name_col']]],
                             continuous_features=cols_dict_HUDD['continuous_features'],
                             categorical_features=cols_dict_HUDD['categorical_features'],
                             setlist_features=cols_dict_HUDD['setlist_features'],
                             outcome_name=cols_dict_HUDD['outcome_name_col'])

    kwargs = {'top_k': MacroVariables.TOP_K, 'features_dtype': feature_dtypes}

    model_dice = dice_ml.Model(model=super_pipeline_model,
                               backend={'explainer': 'dice_xgboost.DiceGenetic',
                                        'model': "lgbmranker_pipeline_model.LGBMRankerPipelineModel"},
                               model_type="regressor",
                               # model_type="classifier",
                               kw_args=kwargs)

    explainer = dice_ml.Dice(data_dice, model_dice, method=explanation_method)

    return explainer, data_dice, model_dice


def get_explanations_HUDD(df_qId_HUDD, exp_c, cols_dict_cf, explainer):
    c_th_lambda = df_qId_HUDD[df_qId_HUDD['pred_rank'] == MacroVariables.TOP_K].iloc[0]['lambda']
    explanations = explainer.generate_counterfactuals(exp_c['profile'][
                                                          # cols_dict_cf['cols_id'] +
                                                          cols_dict_cf['cols_pred']],
                                                      total_CFs=10,
                                                      desired_range=[c_th_lambda, 100],
                                                      # desired_class="opposite",
                                                      verbose=True)
    return explanations

## Define Hyperparameters for the explanation

In [41]:
# The job id for which the counterfactual explanation is to be generated'
# Valid values 160-199
job_id = 162

# The position of the candidate in the ranked list
candidate_position = 15

# Alternative ways to define the target of the explanation
target_rank = MacroVariables.TOP_K
target_score = 0.9

explanation_method = 'genetic'

In [42]:
args = namedtuple('Args', ['target_rank', 'target_score', 'target_candidate'])

args.target_rank = target_rank
args.target_score = target_score
args.candidate_position = candidate_position

df_CDS_JDS, cols_dict_HUDD = load_dataset(fair_data=MacroVariables.FAIR_DATA)
pipeline_fitness, df_fitness_mat, cols_dict_FEDD = build_pipeline_fitness(df_CDS_JDS)
df_train_HUDD, df_val_HUDD, df_test_HUDD = data_split(df_CDS_JDS)
df_train_FEDD, df_val_FEDD, df_test_FEDD = transform_split(df_train_HUDD, df_val_HUDD, df_test_HUDD, pipeline_fitness, cols_dict_HUDD)

ranker, df_train_FEDD, df_val_FEDD, df_test_FEDD = ranking_pipeline(df_train_FEDD, df_val_FEDD, df_test_FEDD, cols_dict_FEDD)
df_CDS_JDS = attach_predictions(df_CDS_JDS, ranker, pipeline_fitness, cols_dict_FEDD)

df_qId_HUDD, exp_c,  = extract_explicand_data_cf(job_id=job_id, exp_c_pred_rank=args.candidate_position,
                                                 pipeline_fitness=pipeline_fitness, df_CDS_JDS=df_CDS_JDS)

df_qId_HUDD_pre, feature_dtypes = prepare_data_cf(df_qId_HUDD, cols_dict_HUDD)

explicand_class, tgt_cf_rank, tgt_cf_score, tgt_cf_candidate = define_target(args, df_qId_HUDD)

explainer, data_dice, model_dice = define_explainer_HUDD(pipeline_fitness, ranker, df_qId_HUDD_pre,
                                                         cols_dict_HUDD, cols_dict_FEDD, feature_dtypes,
                                                         explanation_method=explanation_method)

print('Counterfactual Explanations:')
explanations_HUDD = get_explanations_HUDD(df_qId_HUDD, exp_c, cols_dict_HUDD, explainer)




cols_dict_HUDD {'outcome_name_col': 'lambda', 'continuous_features': ['Age_c', 'Experience_c', 'Experience_j', 'Age_j_min', 'Age_j_max'], 'categorical_features': ['Education_c', 'Gender_c', 'Contract_c', 'Nationality_c', 'Education_j', 'Gender_j', 'Contract_j', 'Nationality_j'], 'setlist_features': ['Competences_c', 'Knowledge_c', 'Languages_c', 'Competences_j', 'Knowledge_j', 'Languages_j'], 'cols_pred': ['Education_c', 'Age_c', 'Gender_c', 'Contract_c', 'Nationality_c', 'Competences_c', 'Knowledge_c', 'Languages_c', 'Experience_c', 'Education_j', 'Age_j_min', 'Age_j_max', 'Gender_j', 'Contract_j', 'Nationality_j', 'Competences_j', 'Knowledge_j', 'Languages_j', 'Experience_j'], 'cols_id': ['qId', 'kId'], 'cols_sensitive': ['Gender_c'], 'col_target': ['score'], 'col_rank': ['rank'], 'cols_pred_preprocess': ['Education_c', 'Age_c', 'Gender_c', 'Contract_c', 'Nationality_c', 'Competences_c', 'Knowledge_c', 'Languages_c', 'Experience_c', 'Education_j', 'Age_j', 'Gender_j', 'Contract_j', '

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[list(output_cols)] = X_new[list(output_cols)].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[list(output_cols)] = X_new[list(output_cols)].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[list(output_cols)] = X_new[list(output_cols)].values
A value is trying to be set on a copy of

Counterfactual Explanations:


  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
 

Initializing initial parameters to the genetic algorithm...


  feature_weights_list.append(round(1 / self.feature_range[feature].max(), 2))


Initialization complete! Generating counterfactuals...


100%|██████████| 1/1 [00:04<00:00,  4.71s/it]

Diverse Counterfactuals found! total time taken: 00 min 04 sec





In [None]:
# final_cfs_df = pd.read_csv('final_cfs_df_HUDD.csv', index_col=0)

In [None]:
# final_cfs_df

In [None]:
# exp_c_profile = pd.read_csv('prof.csv', index_col=0)
# exp_c_profile

In [43]:
print(explanations_HUDD.visualize_as_dataframe())

Query instance (original outcome : -2.161700963973999)


Unnamed: 0,Education_c,Age_c,Gender_c,Contract_c,Nationality_c,Competences_c,Knowledge_c,Languages_c,Experience_c,Education_j,Age_j_min,Age_j_max,Gender_j,Contract_j,Nationality_j,Competences_j,Knowledge_j,Languages_j,Experience_j,lambda
0,No education,25,Male,In presence,Spanish,"(manage vessel control systems, operate vessel...","(engine components, electrical systems used in...",(),0,Any,20,25,Any,Remote,Any,"(operate vessel engine room, manage vessel con...","(mechanics, inland waterway ship building, eng...",(),1,-2.161701



Diverse Counterfactual set (new outcome: [np.float64(-0.87330017212292), 100])


Unnamed: 0,Education_c,Age_c,Gender_c,Contract_c,Nationality_c,Competences_c,Knowledge_c,Languages_c,Experience_c,Education_j,Age_j_min,Age_j_max,Gender_j,Contract_j,Nationality_j,Competences_j,Knowledge_j,Languages_j,Experience_j,lambda
0,No education,25,Male,In presence,Spanish,"(have computer literacy, act reliably, manage ...","(inland waterway ship building, electrical sys...",(),0,Any,20,25,Any,Remote,Any,"(manage vessel control systems, operate vessel...","(inland waterway ship building, electrical sys...",(),1,-0.652254
0,No education,25,Male,In presence,Spanish,"(have computer literacy, inspect engine rooms,...","(electrical machines, electrical systems used ...",(),0,Any,20,25,Any,Remote,Any,"(manage vessel control systems, operate vessel...","(inland waterway ship building, electrical sys...",(),1,-0.652254
0,No education,25,Female,In presence,Spanish,"(manage vessel control systems, perform servic...","(mechanics of vessels, electrical systems used...",(),0,Any,20,25,Any,Remote,Any,"(manage vessel control systems, operate vessel...","(inland waterway ship building, electrical sys...",(),1,-0.652254
0,No education,25,Male,In presence,Swedish,"(manage vessel control systems, execute safety...","(inland waterway ship building, electrical sys...",(),0,Any,20,25,Any,Remote,Any,"(manage vessel control systems, operate vessel...","(inland waterway ship building, electrical sys...",(),1,-0.652254
0,No education,25,Non-binary,In presence,Spanish,"(detect malfunctions in engines, manage vessel...","(inland waterway ship building, electrical sys...",(),0,Any,20,25,Any,Remote,Any,"(manage vessel control systems, operate vessel...","(inland waterway ship building, electrical sys...",(),1,-0.652254
0,No education,25,Female,In presence,Spanish,"(detect malfunctions in engines, manage vessel...","(international waterways, electrical systems u...",(),0,Any,20,25,Any,Remote,Any,"(manage vessel control systems, operate vessel...","(inland waterway ship building, electrical sys...",(),1,-0.652254
0,No education,25,Male,In presence,Swedish,"(have computer literacy, act reliably, manage ...","(inland waterway ship building, electrical sys...",(),0,Any,20,25,Any,Remote,Any,"(manage vessel control systems, operate vessel...","(inland waterway ship building, electrical sys...",(),1,-0.652254
0,No education,25,Non-binary,In presence,Spanish,"(manage vessel control systems, work in a wate...","(european classification of inland waterways, ...",(),0,Any,20,25,Any,Remote,Any,"(manage vessel control systems, operate vessel...","(inland waterway ship building, electrical sys...",(),1,-0.652254
0,PhD,25,Male,In presence,Spanish,"(manage vessel control systems, perform servic...","(mechanics of vessels, electrical systems used...",(),0,Any,20,25,Any,Remote,Any,"(manage vessel control systems, operate vessel...","(inland waterway ship building, electrical sys...",(),1,-0.652254
0,No education,25,Male,Hybrid,Spanish,"(manage vessel control systems, perform servic...","(mechanics of vessels, electrical systems used...",(),0,Any,20,25,Any,Remote,Any,"(manage vessel control systems, operate vessel...","(inland waterway ship building, electrical sys...",(),1,-0.652254


None
