
# Analyze Errors and Explore Interpretability of Models

This notebook demonstrates how to use the Responsible AI Widget's Error Analysis dashboard to understand a model trained on the Census dataset. The goal of this sample notebook is to classify income greater or less than 50K with scikit-learn and explore model errors and explanations:

1. Train a LightGBM classification model using Scikit-learn
2. Run Interpret-Community's 'explain_model' globally and locally to generate model explanations.
3. Visualize model errors and global and local explanations with the Error Analysis visualization dashboard.

## Install Required Packages

## Explain

### Run model explainer at training time

In [1]:
#from sklearn import svm
import pandas as pd
 

### Load the UCI adult census income dataset

In [2]:
import zipfile
from raiutils.dataset import fetch_dataset
outdirname = 'erroranalysis.12.3.20'
zipfilename = outdirname + '.zip'

fetch_dataset('https://publictestdatasets.blob.core.windows.net/data/' + zipfilename, zipfilename)

with zipfile.ZipFile(zipfilename, 'r') as unzip:
    unzip.extractall('.')

Dataset download attempt 1 of 4


In [2]:
train_data = pd.read_csv('adult-train.csv', skipinitialspace=True)
test_data = pd.read_csv('adult-test.csv', skipinitialspace=True)

In [3]:
from sklearn.model_selection import train_test_split
test_data_full = test_data
test_data, _ = train_test_split(test_data, test_size=0.9, random_state=7)

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

def split_label(dataset):
    X = dataset.drop(['income'], axis=1)
    y = dataset[['income']]
    return X, y

def clean_data(X, y):
    features = X.columns.values.tolist()
    classes = y['income'].unique().tolist()
    pipe_cfg = {
        'num_cols': X.dtypes[X.dtypes == 'int64'].index.values.tolist(),
        'cat_cols': X.dtypes[X.dtypes == 'object'].index.values.tolist(),
    }
    num_pipe = Pipeline([
        ('num_imputer', SimpleImputer(strategy='median')),
        ('num_scaler', StandardScaler())
    ])
    cat_pipe = Pipeline([
        ('cat_imputer', SimpleImputer(strategy='constant', fill_value='?')),
        ('cat_encoder', OneHotEncoder(handle_unknown='ignore', sparse=False))
    ])
    feat_pipe = ColumnTransformer([
        ('num_pipe', num_pipe, pipe_cfg['num_cols']),
        ('cat_pipe', cat_pipe, pipe_cfg['cat_cols'])
    ])
    X = feat_pipe.fit_transform(X)
    return X, feat_pipe, features, classes

X_train_original, y_train = split_label(train_data)
X_train, feat_pipe, features, classes = clean_data(X_train_original, y_train)
X_test_original, y_test = split_label(test_data)
c, y_test_full = split_label(test_data_full)
y_test = y_test['income'].to_numpy()
y_test_full = y_test_full['income'].to_numpy()
X_test = feat_pipe.transform(X_test_original)
features = train_data.columns.values[1:].tolist()
classes = y_train['income'].unique().tolist()
categorical_features = train_data.dtypes[train_data.dtypes == 'object'].index.values[1:].tolist()

In [7]:
2

2

### Train a LightGBM classification model, which you want to analyze

In [9]:
from lightgbm import LGBMClassifier
clf = LGBMClassifier(n_estimators=1)
model = clf.fit(X_train, y_train['income'])

In [10]:
y_pred = model.predict(X_test)

In [11]:
y_pred

array(['<=50K', '<=50K', '<=50K', ..., '<=50K', '<=50K', '<=50K'],
      dtype=object)

In [12]:
y_pred_prob = model.predict_proba(X_test)[:,1]

In [13]:
y_pred_prob.min()

0.2196878584101381

In [14]:
y_test

array(['<=50K', '>50K', '<=50K', ..., '>50K', '>50K', '>50K'],
      dtype=object)

In [15]:
import numpy as np
value_to_map = '>50K'

# Map the desired values to 1 and 0 using the 'where' function
y_test_prob = np.where(y_test == value_to_map, 1, 0)



In [16]:
y_test_prob

array([0, 1, 0, ..., 1, 1, 1])

In [17]:
from sklearn.metrics import average_precision_score

In [18]:
average_precision_score(y_test_prob, y_pred_prob)

0.7430570380105718

In [None]:
from sklearn.metrics import average_precision_score

y_pred_prob = model.predict_proba(X_test)[:,1]
average_precision_score(y_test, y_pred_prob)

In [43]:
X_test_original['truth'] = y_test_prob

In [44]:
X_test_original['pred'] = y_pred_prob

In [45]:
X_test_original

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,truth,pred
1998,42,State-gov,172307,Some-college,10,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,0,0.283190
3590,61,Local-gov,205711,HS-grad,9,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,30,United-States,1,0.232571
1156,48,?,117054,5th-6th,3,Divorced,?,Not-in-family,White,Male,0,0,99,United-States,0,0.219688
3156,62,Private,186446,Some-college,10,Divorced,Tech-support,Unmarried,White,Female,0,0,43,United-States,0,0.219688
14403,29,Private,197382,11th,7,Never-married,Craft-repair,Not-in-family,White,Male,0,0,45,United-States,0,0.219688
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5699,34,Private,93213,Masters,14,Married-civ-spouse,Other-service,Husband,White,Male,0,0,30,United-States,0,0.245515
10742,31,Private,323985,HS-grad,9,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,1,0.242416
537,44,Private,98779,Some-college,10,Married-civ-spouse,Craft-repair,Husband,White,Male,5178,0,40,United-States,1,0.322048
9412,43,Federal-gov,205675,Some-college,10,Married-civ-spouse,Tech-support,Husband,White,Male,0,0,75,United-States,1,0.264401


In [53]:
feature = 'education-num'
for i in X_test_original[feature].unique():
    df_temp =  X_test_original[X_test_original[feature]==i]
    ap = average_precision_score(df_temp['truth'], df_temp['pred'])
    print(i,': ',ap,', perc:',round(100*len(df_temp)/len(X_test_original),2),'%' )

10 :  0.6575654818146476 , perc: 22.73 %
9 :  0.536351751600797 , perc: 32.49 %
3 :  0.07692307692307693 , perc: 1.17 %
7 :  0.42857142857142855 , perc: 4.12 %
11 :  0.5584869561915257 , perc: 5.1 %
4 :  0.05 , perc: 2.09 %
16 :  0.9603174603174603 , perc: 0.86 %
2 :  0.1111111111111111 , perc: 0.74 %
14 :  0.8282728866783683 , perc: 5.59 %
13 :  0.786333176980377 , perc: 15.17 %
12 :  0.7542486649832347 , perc: 3.44 %
15 :  0.9773559773559775 , perc: 1.72 %
5 :  0.13333333333333333 , perc: 1.6 %
6 :  0.13333333333333333 , perc: 2.27 %
8 :  0.07692307692307693 , perc: 0.8 %
1 :  nan , perc: 0.12 %


invalid value encountered in true_divide


In [59]:


feature = 'education-num'
results = []
base_score = average_precision_score(X_test_original['truth'], X_test_original['pred'])

for i in X_test_original[feature].unique():
    df_temp = X_test_original[X_test_original[feature] == i]
    ap = average_precision_score(df_temp['truth'], df_temp['pred'])
    percent_row = round(100 * len(df_temp) / len(X_test_original), 2)
    results.append([feature,i, ap, round((ap - base_score)/base_score,2), percent_row])

# Create a DataFrame from the results list
df_results = pd.DataFrame(results, columns=['Feature','Feature_value', 'Average_Precision', 'Change from Base Score', 'Percent Row'])

df_results.sort_values('Average_Precision')


invalid value encountered in true_divide


Unnamed: 0,Feature,Feature_value,Average_Precision,Change from Base Score,Percent Row
5,education-num,4,0.05,-0.93,2.09
2,education-num,3,0.076923,-0.9,1.17
14,education-num,8,0.076923,-0.9,0.8
7,education-num,2,0.111111,-0.85,0.74
12,education-num,5,0.133333,-0.82,1.6
13,education-num,6,0.133333,-0.82,2.27
3,education-num,7,0.428571,-0.42,4.12
1,education-num,9,0.536352,-0.28,32.49
4,education-num,11,0.558487,-0.25,5.1
0,education-num,10,0.657565,-0.12,22.73


In [76]:
X_test_original.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'gender',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'truth', 'pred'],
      dtype='object')

In [90]:
X_test_original['truth'] = y_test_prob
X_test_original['pred'] = model.predict_proba(X_test)[:,1]

base_score = average_precision_score(X_test_original['truth'], X_test_original['pred'])
features= ['workclass']


# Define a custom aggregate function to calculate accuracy
def calculate_map(labels,base):
    ap = average_precision_score(labels['truth'], labels['pred'])
    percent_rows = round(100 * len(labels) / len(test_data),1)
    change_base = round(100*(ap - base_score)/base_score,1)
    return pd.Series({'MAP': ap, '%change_MAP': change_base,'Percent_Rows': percent_rows}) 

# Group the data based on the desired feature and calculate accuracy for each group

additional_args = {'base': base_score}

grouped_data = X_test_original.groupby(features).apply(calculate_map, **additional_args)



# Convert the grouped data to a DataFrame
df_results = grouped_data.reset_index().rename(columns={'grouping_feature': 'Grouping Feature'})

df_results.sort_values('MAP')

#df_results[df_results.Percent_Rows>1]


Unnamed: 0,workclass,MAP,%change_MAP,Percent_Rows
0,?,0.498549,-32.9,5.8
2,Local-gov,0.626759,-15.7,6.6
5,Self-emp-not-inc,0.694309,-6.6,7.1
3,Private,0.741515,-0.2,69.7
6,State-gov,0.802414,8.0,4.2
1,Federal-gov,0.815607,9.8,2.9
4,Self-emp-inc,0.899445,21.0,3.6


# model function predict

In [92]:
X_test_original['truth'].sum()

385

In [136]:
X_train_original['truth'] = np.where(y_train == value_to_map, 1, 0)
X_test_original['truth'] = y_test_prob
X_test_original['pred'] = model.predict_proba(X_test)[:,1]
base_score = average_precision_score(X_test_original['truth'], X_test_original['pred'])
total_class_test = X_test_original['truth'].sum()
total_class_train = X_train_original['truth'].sum()

train_len = len(X_train_original)
test_len = len(X_test_original)

features= ['education-num']


# Define a custom aggregate function to calculate accuracy
def calculate_map(labels,df_len,base=0,total_class=1,mode='test'):
    
    percent_rows = round(100 * len(labels) / df_len,1)
    
    fails_per_group = round(100*labels['truth'].sum()/total_class,1)
    fails_mean = round(labels['truth'].mean(),4)
    if mode == 'test':
        ap = average_precision_score(labels['truth'], labels['pred'])
        change_base = round(100*(ap - base_score)/base_score,1)
        return pd.Series({'MAP': ap, '%change_MAP': change_base,'test_%Rows': percent_rows,\
                          'test_%class1_total':fails_per_group,'test_ratio_class1_group':fails_mean}) 
    else:
        return pd.Series({'train_%Rows': percent_rows,\
                          'train_%class1_total':fails_per_group,'train_ratio_class1_group':fails_mean}) 
        


# Group the data based on the desired feature and calculate accuracy for each group

additional_args_test = {'base': base_score, 'total_class' : total_class_test,'df_len': test_len}
additional_args_train = {'total_class' : total_class_train,'mode':'train','df_len': train_len}

grouped_data = X_test_original.groupby(features).apply(calculate_map, **additional_args_test)
grouped_data_train = X_train_original.groupby(features).apply(calculate_map, **additional_args_train)



# Convert the grouped data to a DataFrame
df_results = grouped_data.reset_index().sort_values('MAP')#.rename(columns={'grouping_feature': 'Grouping Feature'})
df_results_train = grouped_data_train.reset_index()
df_results_all = pd.merge(df_results,df_results_train,on=features,how='left')
df_results_all
#df_results[df_results.Percent_Rows>1]

Unnamed: 0,education-num,MAP,%change_MAP,test_%Rows,test_%class1_total,test_ratio_class1_group,train_%Rows,train_%class1_total,train_ratio_class1_group
0,4,0.05,-93.3,2.1,0.3,0.0294,2.0,0.5,0.0619
1,3,0.076923,-89.6,1.2,0.3,0.0526,1.0,0.2,0.048
2,8,0.076923,-89.6,0.8,0.3,0.0769,1.3,0.4,0.0762
3,2,0.111111,-85.0,0.7,0.3,0.0833,0.5,0.1,0.0357
4,5,0.133333,-82.1,1.6,0.5,0.0769,1.6,0.3,0.0525
5,6,0.133333,-82.1,2.3,0.5,0.0541,2.9,0.8,0.0665
6,7,0.428571,-42.3,4.1,1.8,0.1045,3.6,0.8,0.0511
7,9,0.536352,-27.8,32.5,23.6,0.172,32.3,21.4,0.1595
8,11,0.558487,-24.8,5.1,5.5,0.253,4.2,4.6,0.2612
9,10,0.657565,-11.5,22.7,16.9,0.1757,22.4,17.7,0.1902


In [238]:
X_train_original['truth'] = np.where(y_train == value_to_map, 1, 0)
X_test_original['truth'] = y_test_prob
X_test_original['pred'] = model.predict_proba(X_test)[:,1]
X_test_original['S_Score'] = normalized_similarity_scores
base_score = average_precision_score(X_test_original['truth'], X_test_original['pred'])
total_class_test = X_test_original['truth'].sum()
total_class_train = X_train_original['truth'].sum()

train_len = len(X_train_original)
test_len = len(X_test_original)

features= ['education-num']


# Define a custom aggregate function to calculate accuracy
def calculate_map(labels,df_len,base=0,total_class=1,mode='test'):
    
    percent_rows = round(100 * len(labels) / df_len,1)
    
    fails_per_group = round(100*labels['truth'].sum()/total_class,1)
    fails_mean = round(labels['truth'].mean(),4)
    if mode == 'test':
        ap = average_precision_score(labels['truth'], labels['pred'])
        change_base = round(100*(ap - base_score)/base_score,1)
        sscore_mean = round(labels['S_Score'].mean(),2)
        return pd.Series({'MAP': ap, '%change_MAP': change_base,'test_%Rows': percent_rows,\
                          'test_%class1_total':fails_per_group,'test_ratio_class1_group':fails_mean, 'S_Score_mean': sscore_mean}) 
    else:
        return pd.Series({'train_%Rows': percent_rows,\
                          'train_%class1_total':fails_per_group,'train_ratio_class1_group':fails_mean}) 
        


# Group the data based on the desired feature and calculate accuracy for each group

additional_args_test = {'base': base_score, 'total_class' : total_class_test,'df_len': test_len}
additional_args_train = {'total_class' : total_class_train,'mode':'train','df_len': train_len}

grouped_data = X_test_original.groupby(features).apply(calculate_map, **additional_args_test)
grouped_data_train = X_train_original.groupby(features).apply(calculate_map, **additional_args_train)



# Convert the grouped data to a DataFrame
df_results = grouped_data.reset_index().sort_values('MAP')#.rename(columns={'grouping_feature': 'Grouping Feature'})
df_results_train = grouped_data_train.reset_index()
df_results_all = pd.merge(df_results,df_results_train,on=features,how='left')
df_results_all
#df_results[df_results.Percent_Rows>1]

Unnamed: 0,education-num,MAP,%change_MAP,test_%Rows,test_%class1_total,test_ratio_class1_group,S_Score_mean,train_%Rows,train_%class1_total,train_ratio_class1_group
0,4,0.05,-93.3,2.1,0.3,0.0294,0.72,2.0,0.5,0.0619
1,3,0.076923,-89.6,1.2,0.3,0.0526,0.7,1.0,0.2,0.048
2,8,0.076923,-89.6,0.8,0.3,0.0769,0.8,1.3,0.4,0.0762
3,2,0.111111,-85.0,0.7,0.3,0.0833,0.79,0.5,0.1,0.0357
4,5,0.133333,-82.1,1.6,0.5,0.0769,0.77,1.6,0.3,0.0525
5,6,0.133333,-82.1,2.3,0.5,0.0541,0.8,2.9,0.8,0.0665
6,7,0.428571,-42.3,4.1,1.8,0.1045,0.78,3.6,0.8,0.0511
7,9,0.536352,-27.8,32.5,23.6,0.172,0.74,32.3,21.4,0.1595
8,11,0.558487,-24.8,5.1,5.5,0.253,0.74,4.2,4.6,0.2612
9,10,0.657565,-11.5,22.7,16.9,0.1757,0.73,22.4,17.7,0.1902


In [237]:
normalized_similarity_scores

array([0.64831804, 0.66360856, 0.70948012, ..., 0.86238532, 0.75535168,
       0.72477064])

In [236]:
similarity_scores_df['SScore']

0       0.648318
1       0.663609
2       0.709480
3       0.770642
4       0.724771
          ...   
1623    0.801223
1624    0.510703
1625    0.862385
1626    0.755352
1627    0.724771
Name: SScore, Length: 1628, dtype: float64

In [109]:
X_train_original['truth'] = np.where(y_train == value_to_map, 1, 0)

In [115]:
X_train_original.groupby(features).agg({'truth':[np.mean]}).reset_index()

Unnamed: 0_level_0,education-num,truth
Unnamed: 0_level_1,Unnamed: 1_level_1,mean
0,1,0.0
1,2,0.035714
2,3,0.048048
3,4,0.06192
4,5,0.052529
5,6,0.066452
6,7,0.051064
7,8,0.076212
8,9,0.159509
9,10,0.190235


In [142]:
X_train_original.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'gender',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'truth'],
      dtype='object')

In [143]:
sel = ['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'gender',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country']

In [148]:
type(X_train_original.iloc[1,1])

str

In [145]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from scipy.spatial.distance import hamming

# Assuming you have preprocessed training data 'X_train'
# Assuming you have test data 'X_test'
# Assuming you have feature weights in 'feature_weights'
# Assuming you have feature importances in 'feature_importances'

# Initialize the NearestNeighbors model
k = 5  # Number of nearest neighbors to consider

# Define a custom distance function based on your categorical features
def custom_distance(x1, x2):
    # Assuming x1 and x2 are instances with categorical features

    # Compute the distance based on your preference
    distance = 0
    for i in range(len(x1)):
        if isinstance(x1[i], str) and isinstance(x2[i], str):
            # Handle categorical features
            distance += (x1[i] != x2[i])  # Hamming distance
        else:
            # Handle numerical features
            distance += abs(x1[i] - x2[i])  # Euclidean distance or other distance metric
    
    return distance

nn_model = NearestNeighbors(n_neighbors=k, metric=custom_distance)

# Fit the model with the training data
nn_model.fit(X_train_original[sel])

# Find the distances and indices of the k nearest neighbors for each instance in the test data
distances, indices = nn_model.kneighbors(X_test_origina[sel])

# Calculate the weighted similarity scores for each instance
weighted_similarity_scores = 1 - (distances.mean(axis=1) / distances.max()) #* feature_importances

# Normalize the weighted similarity scores to ensure they are between 0 and 1
normalized_similarity_scores = (weighted_similarity_scores - weighted_similarity_scores.min()) / (weighted_similarity_scores.max() - weighted_similarity_scores.min())

# Convert the normalized similarity scores to a DataFrame
similarity_scores_df = pd.DataFrame({'Similarity Score': normalized_similarity_scores})
similarity_scores_df
# Concatenate the similarity scores DataFrame with the original test data
#test_data_with_similarity = pd.concat([X_test, similarity_scores_df], axis=1)

# Print the test data with the added similarity score column
#print(test_data_with_similarity)


ValueError: could not convert string to float: 'State-gov'

In [158]:
#import pandas as pd
from sklearn.neighbors import NearestNeighbors
from scipy.spatial.distance import hamming

import pandas as pd
from sklearn.neighbors import NearestNeighbors
from scipy.spatial.distance import hamming

# Assuming you have preprocessed training data 'X_train'
# Assuming you have test data 'X_test'
# Assuming you have feature weights in 'feature_weights'
# Assuming you have feature importances in 'feature_importances'

# Initialize the NearestNeighbors model
k = 5  # Number of nearest neighbors to consider

# Define a custom distance function based on your categorical features
def custom_distance(x1, x2):
    # Assuming x1 and x2 are instances with categorical features

    # Compute the distance based on your preference
    distance = 0
    for i in range(len(x1)):
        if isinstance(x1[i], str) and isinstance(x2[i], str):
            # Handle categorical features
            distance += (x1[i] != x2[i])  # Hamming distance
        else:
            # Handle numerical features
            distance += abs(x1[i] - x2[i])  # Euclidean distance or other distance metric
    
    return distance



nn_model = NearestNeighbors(n_neighbors=k, metric=custom_distance)

# Fit the model with the encoded training data
nn_model.fit(X_train_original[sel])

# Find the distances and indices of the k nearest neighbors for each instance in the encoded test data
distances, indices = nn_model.kneighbors(X_test_original[sel])

# Calculate the weighted similarity scores for each instance
weighted_similarity_scores = 1 - (distances.mean(axis=1) / distances.max()) * feature_importances

# Normalize the weighted similarity scores to ensure they are between 0 and 1
normalized_similarity_scores = (weighted_similarity_scores - weighted_similarity_scores.min()) / (weighted_similarity_scores.max() - weighted_similarity_scores.min())

# Convert the normalized similarity scores to a DataFrame
similarity_scores_df = pd.DataFrame({'Similarity Score': normalized_similarity_scores})

similarity_scores_df


ValueError: could not convert string to float: 'State-gov'

In [160]:
X_train_original.dtypes

age                int64
workclass         object
fnlwgt             int64
education         object
education-num      int64
marital-status    object
occupation        object
relationship      object
race              object
gender            object
capital-gain       int64
capital-loss       int64
hours-per-week     int64
native-country    object
truth              int32
dtype: object

In [161]:
data = X_train_original[sel]

In [163]:
from sklearn.preprocessing import MinMaxScaler

# Assuming you have a DataFrame 'data' with both numerical and categorical features

# Find the categorical features that are not numerical
categorical_features = [col for col in data.columns if col not in data.select_dtypes(include=['number']).columns]

# One-hot encode the categorical features
encoded_categorical = pd.get_dummies(data[categorical_features])

# Normalize the numerical features
scaler = MinMaxScaler()
normalized_numerical = scaler.fit_transform(data.select_dtypes(include=['number']))

# Create a new DataFrame with the one-hot encoded categorical features and normalized numerical features
new_data = pd.concat([pd.DataFrame(normalized_numerical, columns=data.select_dtypes(include=['number']).columns), encoded_categorical], axis=1)

# Print the new DataFrame
new_data


Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,workclass_?,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
0,0.301370,0.044302,0.800000,0.021740,0.0,0.397959,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0.452055,0.048238,0.800000,0.000000,0.0,0.122449,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0.287671,0.138113,0.533333,0.000000,0.0,0.397959,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0.493151,0.151068,0.400000,0.000000,0.0,0.397959,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0.150685,0.221488,0.800000,0.000000,0.0,0.397959,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,0.136986,0.166404,0.733333,0.000000,0.0,0.377551,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
32557,0.315068,0.096500,0.533333,0.000000,0.0,0.397959,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
32558,0.561644,0.094827,0.533333,0.000000,0.0,0.397959,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
32559,0.068493,0.128499,0.533333,0.000000,0.0,0.193878,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [173]:
numerical_features =  [col for col in data.columns if col in data.select_dtypes(include=['number']).columns]
categorical_features =  [col for col in data.columns if col not in numerical_features]  # Replace with your actual categorical feature names

# Define the pipeline steps
steps = []

# Add a step to one-hot encode the categorical features
categorical_transformer = OneHotEncoder(sparse=False, handle_unknown='ignore')
steps.append(('categorical_transformer', categorical_transformer))

# Add a step to normalize the numerical features
numerical_transformer = MinMaxScaler()
steps.append(('numerical_transformer', numerical_transformer))

# Create the pipeline
pipeline = Pipeline(steps)

# Apply the pipeline to transform the training data
transformed_data = pipeline.fit_transform(data)



In [174]:
transformed_data

array([[0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.]])

In [203]:
numerical_transformer = MinMaxScaler()
categorical_transformer = OneHotEncoder(sparse=False, handle_unknown='ignore')

# Create a ColumnTransformer to apply different transformations to different columns
preprocessor = ColumnTransformer(
    transformers=[
        ('numerical', numerical_transformer, numerical_features),
        ('categorical', categorical_transformer, categorical_features)
    ])

# Create the pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Fit the pipeline on the training data
pipeline.fit(train_data)

# Apply the pipeline to transform the training data
transformed_train_data = pipeline.transform(train_data)

# Get the transformed column names
feature_names = numerical_features + categorical_features
transformed_categorical_feature_names = []
for i, feature_name in enumerate(categorical_features):
    transformed_categories = pipeline.named_steps['preprocessor'].named_transformers_['categorical'].categories_[i]
    for category in transformed_categories:
        transformed_categorical_feature_names.append(f"{feature_name}_{category}")

column_names = numerical_features + transformed_categorical_feature_names

# Convert the transformed training data back to a DataFrame
transformed_train_data_df = pd.DataFrame(transformed_train_data, columns=column_names)

# Print the transformed training data DataFrame
transformed_train_data_df

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,workclass_?,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
0,0.301370,0.044302,0.800000,0.021740,0.0,0.397959,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.452055,0.048238,0.800000,0.000000,0.0,0.122449,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.287671,0.138113,0.533333,0.000000,0.0,0.397959,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.493151,0.151068,0.400000,0.000000,0.0,0.397959,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.150685,0.221488,0.800000,0.000000,0.0,0.397959,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,0.136986,0.166404,0.733333,0.000000,0.0,0.377551,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
32557,0.315068,0.096500,0.533333,0.000000,0.0,0.397959,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
32558,0.561644,0.094827,0.533333,0.000000,0.0,0.397959,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
32559,0.068493,0.128499,0.533333,0.000000,0.0,0.193878,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [204]:
transformed_test_data = pipeline.transform(test_data)

# Convert the transformed test data back to a DataFrame
transformed_test_data_df = pd.DataFrame(transformed_test_data, columns=column_names)

# Print the transformed test data DataFrame
transformed_test_data_df

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,workclass_?,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
0,0.342466,0.108680,0.600000,0.000000,0.000000,0.448980,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.602740,0.131366,0.533333,0.000000,0.000000,0.295918,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.424658,0.071154,0.133333,0.000000,0.000000,1.000000,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.616438,0.118282,0.600000,0.000000,0.000000,0.428571,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.164384,0.125709,0.400000,0.000000,0.000000,0.448980,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1623,0.232877,0.054963,0.866667,0.000000,0.000000,0.295918,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1624,0.191781,0.211692,0.533333,0.000000,0.000000,0.397959,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1625,0.369863,0.058743,0.600000,0.051781,0.000000,0.397959,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1626,0.356164,0.131342,0.600000,0.000000,0.000000,0.755102,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [209]:
# Define the transformers for numerical and categorical features
numerical_transformer = MinMaxScaler()
categorical_transformer = OneHotEncoder(sparse=False, handle_unknown='ignore')

# Create a ColumnTransformer to apply different transformations to different columns
preprocessor = ColumnTransformer(
    transformers=[
        ('numerical', numerical_transformer, numerical_features),
        ('categorical', categorical_transformer, categorical_features)
    ])

# Create the pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Fit and transform the training data
transformed_train_data = pipeline.fit_transform(train_data)

# Get the transformed column names
feature_names = numerical_features + list(pipeline.named_steps['preprocessor']
                                          .named_transformers_['categorical']
                                          .get_feature_names_out(categorical_features))

# Convert the transformed training data back to a DataFrame
transformed_train_data_df = pd.DataFrame(transformed_train_data, columns=feature_names)

# Print the transformed training data DataFrame
print(transformed_train_data_df)

# Transform the test data
transformed_test_data = pipeline.transform(test_data)

# Convert the transformed test data back to a DataFrame
transformed_test_data_df = pd.DataFrame(transformed_test_data, columns=feature_names)

# Print the transformed test data DataFrame
print(transformed_test_data_df)

AttributeError: 'OneHotEncoder' object has no attribute 'get_feature_names_out'

In [210]:
data2 = data.copy()

In [212]:
data2[categorical_features] = data2[categorical_features].astype('category')
data2[categorical_features] = data2[categorical_features].apply(lambda x: x.cat.codes)

# Print the updated DataFrame
data2

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country
0,39,7,77516,9,13,4,1,1,4,1,2174,0,40,39
1,50,6,83311,9,13,2,4,0,4,1,0,0,13,39
2,38,4,215646,11,9,0,6,1,4,1,0,0,40,39
3,53,4,234721,1,7,2,6,0,2,1,0,0,40,39
4,28,4,338409,9,13,2,10,5,2,0,0,0,40,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,4,257302,7,12,2,13,5,4,0,0,0,38,39
32557,40,4,154374,11,9,2,7,0,4,1,0,0,40,39
32558,58,4,151910,11,9,6,1,4,4,0,0,0,40,39
32559,22,4,201490,11,9,4,1,3,4,1,0,0,20,39


# Start here

In [5]:
X_train_original.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'gender',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country'],
      dtype='object')

In [6]:
sel = ['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'gender',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country']

In [7]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df_train = X_train_original[sel].copy()
df_test = X_test_original[sel].copy()


numerical_features =  [col for col in df_train.columns if col in df_train.select_dtypes(include=['number']).columns]
categorical_features =  [col for col in df_train.columns if col not in numerical_features]  # Replace with your actual categorical feature names




df_train[numerical_features] = scaler.fit_transform(df_train[numerical_features])
df_test[numerical_features] = scaler.fit_transform(df_test[numerical_features])


data2 = pd.concat([df_train,df_test],axis=0)
data2[categorical_features] = data2[categorical_features].astype('category')
data2[categorical_features] = data2[categorical_features].apply(lambda x: x.cat.codes)


X_train1 = data2.iloc[0:len(df_train)]
X_test1 = data2.iloc[len(df_train):len(df_train)+len(df_test)]
X_test1

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country
1998,0.378788,7,0.199062,15,0.600000,2,4,0,4,1,0.000000,0.000000,0.448980,39
3590,0.666667,2,0.242629,11,0.533333,2,10,0,4,1,0.000000,0.000000,0.295918,39
1156,0.469697,0,0.127000,4,0.133333,0,0,1,4,1,0.000000,0.000000,1.000000,39
3156,0.681818,4,0.217503,15,0.600000,0,13,4,4,0,0.000000,0.000000,0.428571,39
14403,0.181818,4,0.231766,1,0.400000,4,3,1,4,1,0.000000,0.000000,0.448980,39
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5699,0.257576,4,0.095906,12,0.866667,2,8,0,4,1,0.000000,0.000000,0.295918,39
10742,0.212121,4,0.396884,11,0.533333,2,4,5,4,0,0.000000,0.000000,0.397959,39
537,0.409091,4,0.103165,15,0.600000,2,3,0,4,1,0.051781,0.000000,0.397959,39
9412,0.393939,1,0.242582,15,0.600000,2,13,0,4,1,0.000000,0.000000,0.755102,39


In [8]:
from sklearn.preprocessing import MinMaxScaler
#no min max scaler
#scaler = MinMaxScaler()
df_train = X_train_original[sel].copy()
df_test = X_test_original[sel].copy()


numerical_features =  [col for col in df_train.columns if col in df_train.select_dtypes(include=['number']).columns]
categorical_features =  [col for col in df_train.columns if col not in numerical_features]  # Replace with your actual categorical feature names




#df_train[numerical_features] = scaler.fit_transform(df_train[numerical_features])
#df_test[numerical_features] = scaler.fit_transform(df_test[numerical_features])


data2 = pd.concat([df_train,df_test],axis=0)
data2[categorical_features] = data2[categorical_features].astype('category')
data2[categorical_features] = data2[categorical_features].apply(lambda x: x.cat.codes)


X_train1 = data2.iloc[0:len(df_train)]
X_test1 = data2.iloc[len(df_train):len(df_train)+len(df_test)]
X_test1

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country
1998,42,7,172307,15,10,2,4,0,4,1,0,0,45,39
3590,61,2,205711,11,9,2,10,0,4,1,0,0,30,39
1156,48,0,117054,4,3,0,0,1,4,1,0,0,99,39
3156,62,4,186446,15,10,0,13,4,4,0,0,0,43,39
14403,29,4,197382,1,7,4,3,1,4,1,0,0,45,39
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5699,34,4,93213,12,14,2,8,0,4,1,0,0,30,39
10742,31,4,323985,11,9,2,4,5,4,0,0,0,40,39
537,44,4,98779,15,10,2,3,0,4,1,5178,0,40,39
9412,43,1,205675,15,10,2,13,0,4,1,0,0,75,39


In [9]:
import numpy as np
value_to_map = '>50K'

# Map the desired values to 1 and 0 using the 'where' function
y_test = np.where(y_test == value_to_map, 1, 0)
y_test

array([0, 1, 0, ..., 1, 1, 1])

In [10]:
y_train = np.where(y_train == value_to_map, 1, 0)


In [11]:
y_train = y_train.T[0]

In [28]:
pip install catboost

Collecting catboost
  Downloading catboost-1.2-cp38-cp38-win_amd64.whl (101.0 MB)
Installing collected packages: catboost
Successfully installed catboost-1.2
Collecting catboost
  Downloading catboost-1.2-cp38-cp38-win_amd64.whl (101.0 MB)
Installing collected packages: catboost
Successfully installed catboost-1.2
Note: you may need to restart the kernel to use updated packages.


In [33]:
X_train1

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country
0,0.301370,7,0.044302,9,0.800000,4,1,1,4,1,0.021740,0.0,0.397959,39
1,0.452055,6,0.048238,9,0.800000,2,4,0,4,1,0.000000,0.0,0.122449,39
2,0.287671,4,0.138113,11,0.533333,0,6,1,4,1,0.000000,0.0,0.397959,39
3,0.493151,4,0.151068,1,0.400000,2,6,0,2,1,0.000000,0.0,0.397959,39
4,0.150685,4,0.221488,9,0.800000,2,10,5,2,0,0.000000,0.0,0.397959,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,0.136986,4,0.166404,7,0.733333,2,13,5,4,0,0.000000,0.0,0.377551,39
32557,0.315068,4,0.096500,11,0.533333,2,7,0,4,1,0.000000,0.0,0.397959,39
32558,0.561644,4,0.094827,11,0.533333,6,1,4,4,0,0.000000,0.0,0.397959,39
32559,0.068493,4,0.128499,11,0.533333,4,1,3,4,1,0.000000,0.0,0.193878,39


In [62]:
from catboost import CatBoostClassifier

# Define the CatBoost classifier with max iterations set to 20
model = CatBoostClassifier(iterations=40,cat_features=categorical_features,random_seed=1)

# Fit the model with the training data
model.fit(X_train1, y_train)


Learning rate set to 0.5
0:	learn: 0.4831551	total: 40.3ms	remaining: 1.57s
1:	learn: 0.4020440	total: 79.9ms	remaining: 1.52s
2:	learn: 0.3620845	total: 122ms	remaining: 1.5s
3:	learn: 0.3385397	total: 162ms	remaining: 1.46s
4:	learn: 0.3238887	total: 206ms	remaining: 1.44s
5:	learn: 0.3164782	total: 251ms	remaining: 1.42s
6:	learn: 0.3112738	total: 292ms	remaining: 1.38s
7:	learn: 0.3062636	total: 340ms	remaining: 1.36s
8:	learn: 0.3037247	total: 380ms	remaining: 1.31s
9:	learn: 0.3012986	total: 420ms	remaining: 1.26s
10:	learn: 0.2987607	total: 463ms	remaining: 1.22s
11:	learn: 0.2969595	total: 501ms	remaining: 1.17s
12:	learn: 0.2957377	total: 540ms	remaining: 1.12s
13:	learn: 0.2944975	total: 579ms	remaining: 1.07s
14:	learn: 0.2935625	total: 617ms	remaining: 1.03s
15:	learn: 0.2911090	total: 658ms	remaining: 986ms
16:	learn: 0.2908116	total: 696ms	remaining: 942ms
17:	learn: 0.2895200	total: 735ms	remaining: 898ms
18:	learn: 0.2888280	total: 772ms	remaining: 854ms
19:	learn: 0.28

<catboost.core.CatBoostClassifier at 0x1f7be27e280>

In [63]:
from sklearn.metrics import average_precision_score

y_pred_prob = model.predict_proba(X_test1)[:,1]
average_precision_score(y_test, y_pred_prob)

0.8100562340265048

In [64]:
y_pred_prob_train = model.predict_proba(X_train1)[:,1]
average_precision_score(y_train, y_pred_prob_train)

0.8322543042841081

In [20]:
Xtrain2 = X_train1.copy()
Xtest2 = X_test1.copy()

### Train test split and fit model, add random columns to evaluate feature importance

In [21]:
X_train2 = Xtrain2
X_test2 = Xtest2

In [35]:
X_train2['Random Column'] = np.random.rand(len(X_train2))
X_test2['Random Column'] = np.random.rand(len(X_test2))

In [37]:
X_train2['Random Column2'] = np.random.rand(len(X_train2))
X_test2['Random Column2'] = np.random.rand(len(X_test2))

In [39]:
X_train2['Random Column3'] = np.random.rand(len(X_train2))
X_test2['Random Column3'] = np.random.rand(len(X_test2))

In [40]:
from catboost import CatBoostClassifier

# Define the CatBoost classifier with max iterations set to 20
model = CatBoostClassifier(iterations=20,cat_features=categorical_features,random_seed=1)

# Fit the model with the training data
model.fit(X_train2, y_train)

y_pred_prob_train = model.predict_proba(X_train2)[:,1]
print(average_precision_score(y_train, y_pred_prob_train))

y_pred_prob = model.predict_proba(X_test2)[:,1]
print(average_precision_score(y_test, y_pred_prob))

Learning rate set to 0.5
0:	learn: 0.4855987	total: 46.4ms	remaining: 881ms
1:	learn: 0.4048358	total: 82.8ms	remaining: 746ms
2:	learn: 0.3646410	total: 108ms	remaining: 611ms
3:	learn: 0.3401579	total: 134ms	remaining: 535ms
4:	learn: 0.3259650	total: 158ms	remaining: 473ms
5:	learn: 0.3182077	total: 180ms	remaining: 419ms
6:	learn: 0.3122553	total: 207ms	remaining: 384ms
7:	learn: 0.3080642	total: 231ms	remaining: 347ms
8:	learn: 0.3045577	total: 254ms	remaining: 310ms
9:	learn: 0.3010724	total: 277ms	remaining: 277ms
10:	learn: 0.2989437	total: 300ms	remaining: 245ms
11:	learn: 0.2974301	total: 322ms	remaining: 215ms
12:	learn: 0.2955008	total: 344ms	remaining: 185ms
13:	learn: 0.2944525	total: 368ms	remaining: 158ms
14:	learn: 0.2924513	total: 391ms	remaining: 130ms
15:	learn: 0.2906107	total: 419ms	remaining: 105ms
16:	learn: 0.2899630	total: 443ms	remaining: 78.2ms
17:	learn: 0.2893012	total: 466ms	remaining: 51.7ms
18:	learn: 0.2888456	total: 489ms	remaining: 25.7ms
19:	learn: 

In [239]:
distances

array([[0.00477377, 0.00477377, 0.00477377, 0.00477377, 0.00477377],
       [0.00456621, 0.00456621, 0.00456621, 0.00456621, 0.00456621],
       [0.00394355, 0.00394355, 0.00394355, 0.00394355, 0.00394355],
       ...,
       [0.001868  , 0.001868  , 0.001868  , 0.001868  , 0.001868  ],
       [0.00332088, 0.00332088, 0.00332088, 0.00332088, 0.00332088],
       [0.00373599, 0.00373599, 0.00373599, 0.00373599, 0.00373599]])

## Use feature importance to weight the importance per feature when implementing similarity score using Kmeans

In [35]:
#cols = list(feat_importance_dict.keys())
#feat_importance = list(feat_importance_dict.values())
cols = list(X_train1.columns)
feat_importance = [0,1,1,1,1,1,1,1,1,1,1,1,1,1]

In [107]:
df_train.iloc[indices[5].tolist()]

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,truth
14331,36,Private,356824,HS-grad,9,Divorced,Sales,Not-in-family,White,Female,0,0,40,United-States,0
18908,55,Private,284095,HS-grad,9,Divorced,Sales,Not-in-family,White,Female,0,0,37,United-States,0
11113,55,Private,220262,HS-grad,9,Divorced,Sales,Not-in-family,White,Female,0,0,40,United-States,0
13882,33,Private,322873,HS-grad,9,Divorced,Sales,Not-in-family,White,Female,0,0,50,United-States,0
4954,36,Private,169926,HS-grad,9,Divorced,Sales,Not-in-family,White,Female,0,0,40,United-States,0


In [108]:
df_test.iloc[5].to_frame().T

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,truth,pred,S_Score
15359,46,Private,191821,HS-grad,9,Divorced,Sales,Not-in-family,White,Female,0,0,40,United-States,0,0.219688,0.973956


In [95]:
distances[1]

array([0.20672845, 0.46129748, 1.10541832, 1.10753239, 1.10902055])

In [109]:
indices

array([[ 2609, 15514, 30249, 17985,  1339],
       [11518,  7851, 13516,  2692,  6577],
       [ 1544, 10143, 16065,   221, 15846],
       ...,
       [27868,   276, 28583, 11587, 11582],
       [18091, 32076, 11246,  2028, 24447],
       [ 6695, 31477, 31957,  7468, 30153]], dtype=int64)

## Applying weighted nearest neigbor using feature importance to weight more important features, output similarity score to find instances that have similar rows and others that dont

In [86]:
#import pandas as pd
from sklearn.neighbors import NearestNeighbors

# Assuming you have preprocessed training data 'X_train'
# Assuming you have test data 'X_test'
# Assuming you have feature weights in 'feature_weights'
# Assuming you have feature importances in 'feature_importances'

# Initialize the NearestNeighbors model
k = 5  # Number of nearest neighbors to consider

# Define a custom distance function based on your categorical features
def custom_distance(x1, x2):
    # Assuming x1 and x2 are instances with categorical features

    # Compute the distance based on your preference
    distance = 0
    for i in range(len(x1)):
        # Handle numerical features
        feat_distance = abs(x1[i] - x2[i]) 
        if feat_distance >= 1:
            distance += (1 * feat_importance[i])  # Euclidean distance or other distance metric
            #print(x1)
            #print(x2)
            #print(X_test1.columns[i],x1[i] ,x2[i],'distance: ',feat_distance)
        else:
            distance += (feat_distance * feat_importance[i])  # Euclidean distance or other distance metric
    #print(distance)
    return distance



nn_model = NearestNeighbors(n_neighbors=k, metric=custom_distance)

# Fit the model with the encoded training data
nn_model.fit(X_train1[cols])

# Find the distances and indices of the k nearest neighbors for each instance in the encoded test data
distances, indices = nn_model.kneighbors(X_test1[cols])

# Calculate the weighted similarity scores for each instance
weighted_similarity_scores = 1 - (distances.mean(axis=1) / distances.max()) #* feature_importances

# Normalize the weighted similarity scores to ensure they are between 0 and 1
normalized_similarity_scores = (weighted_similarity_scores - weighted_similarity_scores.min()) / (weighted_similarity_scores.max() - weighted_similarity_scores.min())

# Convert the normalized similarity scores to a DataFrame
similarity_scores_df = pd.DataFrame({'Similarity Score': normalized_similarity_scores})

similarity_scores_df


Unnamed: 0,Similarity Score
0,0.929915
1,0.737446
2,0.543618
3,0.958933
4,0.961036
...,...
1623,0.888123
1624,0.940530
1625,0.990679
1626,0.854944


In [80]:
X_train1

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country
0,0.301370,7,0.044302,9,0.800000,4,1,1,4,1,0.021740,0.0,0.397959,39
1,0.452055,6,0.048238,9,0.800000,2,4,0,4,1,0.000000,0.0,0.122449,39
2,0.287671,4,0.138113,11,0.533333,0,6,1,4,1,0.000000,0.0,0.397959,39
3,0.493151,4,0.151068,1,0.400000,2,6,0,2,1,0.000000,0.0,0.397959,39
4,0.150685,4,0.221488,9,0.800000,2,10,5,2,0,0.000000,0.0,0.397959,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,0.136986,4,0.166404,7,0.733333,2,13,5,4,0,0.000000,0.0,0.377551,39
32557,0.315068,4,0.096500,11,0.533333,2,7,0,4,1,0.000000,0.0,0.397959,39
32558,0.561644,4,0.094827,11,0.533333,6,1,4,4,0,0.000000,0.0,0.397959,39
32559,0.068493,4,0.128499,11,0.533333,4,1,3,4,1,0.000000,0.0,0.193878,39


In [87]:
distances

array([[0.18894319, 0.20239315, 0.21179672, 0.22716852, 0.23521259],
       [0.20672845, 0.46129748, 1.10541832, 1.10753239, 1.10902055],
       [0.78796441, 1.22337402, 1.53012821, 1.67660358, 1.71704805],
       ...,
       [0.02254604, 0.02577518, 0.02827576, 0.03062083, 0.03503168],
       [0.39768086, 0.40841625, 0.4200943 , 0.45827159, 0.52019957],
       [0.83227981, 0.83572103, 0.87559609, 0.87804387, 0.88086947]])

In [262]:
sel

['age',
 'workclass',
 'fnlwgt',
 'education',
 'education-num',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'gender',
 'capital-gain',
 'capital-loss',
 'hours-per-week',
 'native-country']

In [111]:
indices

array([[ 2609, 15514, 30249, 17985,  1339],
       [11518,  7851, 13516,  2692,  6577],
       [ 1544, 10143, 16065,   221, 15846],
       ...,
       [27868,   276, 28583, 11587, 11582],
       [18091, 32076, 11246,  2028, 24447],
       [ 6695, 31477, 31957,  7468, 30153]], dtype=int64)

In [112]:
array_as_list = [list(row) for row in indices]

# Assign the list to each corresponding row in the DataFrame
df_test = df_test.assign(neigbour=array_as_list)
df_test['neigbour']

1998       [2609, 15514, 30249, 17985, 1339]
3590        [11518, 7851, 13516, 2692, 6577]
1156        [1544, 10143, 16065, 221, 15846]
3156      [7028, 21268, 28677, 27880, 22929]
14403       [7212, 6899, 4243, 27455, 19188]
                        ...                 
5699     [28292, 27953, 10165, 25729, 31400]
10742      [6686, 5725, 30730, 20488, 16826]
537        [27868, 276, 28583, 11587, 11582]
9412      [18091, 32076, 11246, 2028, 24447]
12463      [6695, 31477, 31957, 7468, 30153]
Name: neigbour, Length: 1628, dtype: object

In [123]:
df_train.iloc[[2609, 15514, 30249, 17985, 1339]]#['truth'].mean()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,truth
2609,45,State-gov,102308,Some-college,10,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,0
15514,51,State-gov,82504,Some-college,10,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,0
30249,56,State-gov,68658,Some-college,10,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,1
17985,44,State-gov,96249,Some-college,10,Married-civ-spouse,Exec-managerial,Husband,White,Male,3411,0,40,United-States,0
1339,38,State-gov,34180,Some-college,10,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,0


In [124]:
df_test['neigbours_mean'] = df_test['neigbour'].apply(lambda x: df_train.iloc[x]['truth'].mean())
df_test['neigbours_mean']

1998     0.2
3590     0.4
1156     0.0
3156     0.0
14403    0.0
        ... 
5699     0.2
10742    0.6
537      0.6
9412     0.6
12463    0.0
Name: neigbours_mean, Length: 1628, dtype: float64

In [126]:
df_test[df_test.education=='HS-grad']

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,truth,pred,S_Score,neigbour,neigbours_mean
3590,61,Local-gov,205711,HS-grad,9,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,30,United-States,1,0.232571,0.737446,"[11518, 7851, 13516, 2692, 6577]",0.4
15359,46,Private,191821,HS-grad,9,Divorced,Sales,Not-in-family,White,Female,0,0,40,United-States,0,0.219688,0.973956,"[14331, 18908, 11113, 13882, 4954]",0.0
4428,28,Private,115438,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,50,United-States,1,0.230736,0.996701,"[10267, 1147, 17066, 11901, 29759]",0.4
5688,52,Federal-gov,157454,HS-grad,9,Never-married,Machine-op-inspct,Not-in-family,White,Female,0,0,40,United-States,0,0.219688,0.665171,"[25686, 28230, 5579, 2763, 31484]",0.0
6480,45,Private,39464,HS-grad,9,Divorced,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,0,0.219688,0.995704,"[31922, 24344, 14598, 3332, 803]",0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7566,41,?,188436,HS-grad,9,Married-civ-spouse,?,Husband,White,Male,0,0,20,Canada,0,0.232571,0.804152,"[20953, 11192, 3297, 13203, 499]",0.4
11449,30,Private,183017,HS-grad,9,Divorced,Machine-op-inspct,Own-child,White,Male,0,0,40,United-States,0,0.219688,0.972383,"[28927, 11702, 11607, 17615, 23611]",0.0
4307,64,Private,186731,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,0,0.219688,0.973234,"[28726, 22381, 21568, 18940, 32558]",0.0
10742,31,Private,323985,HS-grad,9,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,1,0.242416,0.940530,"[6686, 5725, 30730, 20488, 16826]",0.6


## Check metric performance (MAP) Per group relative to average MAP for entire dataset

In [18]:
sim_score=False
df_train = X_train_original[sel].copy()
df_test = X_test_original[sel].copy()
df_train['truth'] = y_train
df_train['pred'] = model.predict_proba(X_train1)[:,1]
df_test['truth'] = y_test
df_test['pred'] = model.predict_proba(X_test1)[:,1]

base_score = average_precision_score(df_test['truth'], df_test['pred'])
total_class_test = df_test['truth'].sum()
total_class_train = df_train['truth'].sum()

train_len = len(df_train)
test_len = len(df_test)


if sim_score:
    df_test['S_Score'] = normalized_similarity_scores
    array_as_list = [list(row) for row in indices]
    df_test = df_test.assign(neigbour=array_as_list)
    df_test['neigbours_mean'] = df_test['neigbour'].apply(lambda x: df_train.iloc[x]['truth'].mean())


features= ['education']


# Define a custom aggregate function to calculate accuracy
def calculate_map(labels,df_len,base=0,total_class=1,mode='test'):
    
    percent_rows = round(100 * len(labels) / df_len,1)
    
    fails_per_group = round(100*labels['truth'].sum()/total_class,1)
    fails_mean = round(labels['truth'].mean(),4)
    ap = average_precision_score(labels['truth'], labels['pred'])
    if mode == 'test':
        
        change_base = round(100*(ap - base_score)/base_score,1)
        if sim_score:
            sscore_mean = round(labels['S_Score'].mean(),2)
            neigbours_mean = round(labels['neigbours_mean'].mean(),4)
            return pd.Series({'MAP': ap, '%change_MAP': change_base,'test_%Rows': percent_rows,\
                              'test_%class1_total':fails_per_group,'test_ratio_class1_group':fails_mean,\
                              'S_Score_mean': sscore_mean,'neigbours_mean':neigbours_mean }) 
        else:
            return pd.Series({'MAP': ap, '%change_MAP': change_base,'test_%Rows': percent_rows,\
                              'test_%class1_total':fails_per_group,'test_ratio_class1_group':fails_mean}) 
            
    else:
        
        return pd.Series({'train_MAP': ap,'train_%Rows': percent_rows,\
                          'train_%class1_total':fails_per_group,'train_ratio_class1_group':fails_mean}) 
        


# Group the data based on the desired feature and calculate accuracy for each group

additional_args_test = {'base': base_score, 'total_class' : total_class_test,'df_len': test_len}
additional_args_train = {'total_class' : total_class_train,'mode':'train','df_len': train_len}

grouped_data = df_test.groupby(features).apply(calculate_map, **additional_args_test)
grouped_data_train = df_train.groupby(features).apply(calculate_map, **additional_args_train)



# Convert the grouped data to a DataFrame
df_results_test = grouped_data.reset_index().sort_values('MAP')#.rename(columns={'grouping_feature': 'Grouping Feature'})
df_results_train = grouped_data_train.reset_index()
df_results_all = pd.merge(df_results_test,df_results_train,on=features,how='left')
df_results_all
df_results_all[df_results_all['test_%Rows']>1]

Unnamed: 0,education,MAP,%change_MAP,test_%Rows,test_%class1_total,test_ratio_class1_group,train_MAP,train_%Rows,train_%class1_total,train_ratio_class1_group
0,5th-6th,0.142857,-82.1,1.2,0.3,0.0526,0.50076,1.0,0.2,0.048
1,7th-8th,0.142857,-82.1,2.1,0.3,0.0294,0.44078,2.0,0.5,0.0619
2,10th,0.291667,-63.4,2.3,0.5,0.0541,0.525759,2.9,0.8,0.0665
4,11th,0.65184,-18.3,4.1,1.8,0.1045,0.591893,3.6,0.8,0.0511
5,HS-grad,0.670067,-16.0,32.5,23.6,0.172,0.667364,32.3,21.4,0.1595
6,Assoc-voc,0.709076,-11.1,5.1,5.5,0.253,0.769196,4.2,4.6,0.2612
7,Assoc-acdm,0.726884,-8.8,3.4,3.6,0.25,0.794167,3.3,3.4,0.2484
8,Some-college,0.728521,-8.6,22.7,16.9,0.1757,0.742929,22.4,17.7,0.1902
9,9th,0.75,-5.9,1.6,0.5,0.0769,0.429985,1.6,0.3,0.0525
10,Bachelors,0.855981,7.3,15.2,27.0,0.4211,0.874538,16.4,28.3,0.4148


In [100]:
sample_weights = [5 if edu == "HS-grad" else 1 for edu in X_train_original["education"]]

In [125]:
sample_weights = [1] * len(X_train)
for i, (x, y) in enumerate(zip(X_train_original["education"], y_train)):
    if y == 1:
        sample_weights[i] = 2
    #if x == "HS-grad":
     #   sample_weights[i] += 1

In [134]:
y_test

array([0, 1, 0, ..., 1, 1, 1])

In [87]:
df_all = X_test_original.copy()
df_all['pred'] = y_test
df_all_hs = df_all[df_all.education=="HS-grad"]

In [74]:
sum(y_train)/len(y_train)

0.2408095574460244

In [65]:
def custom_loss(y_true, y_pred, sample_weight):
    # Assign higher penalties for missing certain samples
    penalty = np.where(sample_weight == 0, 2, 1)
    loss = np.mean(np.multiply(penalty, np.square(y_true - y_pred)))
    return loss

In [136]:
from catboost import Pool, CatBoostClassifier

# Create your training dataset with sample weights
#train_data = Pool(X_train1, y_train, weight=sample_weights)

# Initialize and train your CatBoost model with the custom loss function and sample weights
model2 = CatBoostClassifier(iterations=40,cat_features=categorical_features,random_seed=42)
model2.fit(X_train1, y_train, sample_weight=sample_weights)

y_pred_prob2 = model2.predict_proba(X_test2)[:,1]
average_precision_score(y_test, y_pred_prob2)

Learning rate set to 0.5
0:	learn: 0.5177497	total: 25.1ms	remaining: 980ms
1:	learn: 0.4463860	total: 48.4ms	remaining: 919ms
2:	learn: 0.4076536	total: 70.9ms	remaining: 875ms
3:	learn: 0.3904202	total: 94.6ms	remaining: 852ms
4:	learn: 0.3796932	total: 116ms	remaining: 815ms
5:	learn: 0.3703496	total: 138ms	remaining: 783ms
6:	learn: 0.3635808	total: 160ms	remaining: 756ms
7:	learn: 0.3601675	total: 183ms	remaining: 732ms
8:	learn: 0.3572147	total: 205ms	remaining: 707ms
9:	learn: 0.3540973	total: 232ms	remaining: 695ms
10:	learn: 0.3522646	total: 256ms	remaining: 674ms
11:	learn: 0.3484465	total: 278ms	remaining: 649ms
12:	learn: 0.3472498	total: 300ms	remaining: 624ms
13:	learn: 0.3447958	total: 321ms	remaining: 597ms
14:	learn: 0.3431229	total: 342ms	remaining: 570ms
15:	learn: 0.3414288	total: 363ms	remaining: 545ms
16:	learn: 0.3407829	total: 384ms	remaining: 520ms
17:	learn: 0.3403885	total: 409ms	remaining: 500ms
18:	learn: 0.3392545	total: 433ms	remaining: 478ms
19:	learn: 0

0.8093018360035997

In [137]:
y_pred_prob2 = model2.predict_proba(X_train2)[:,1]
average_precision_score(y_train, y_pred_prob2)

0.8331694519451784

In [83]:
X_test_original

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country
1998,42,State-gov,172307,Some-college,10,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States
3590,61,Local-gov,205711,HS-grad,9,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,30,United-States
1156,48,?,117054,5th-6th,3,Divorced,?,Not-in-family,White,Male,0,0,99,United-States
3156,62,Private,186446,Some-college,10,Divorced,Tech-support,Unmarried,White,Female,0,0,43,United-States
14403,29,Private,197382,11th,7,Never-married,Craft-repair,Not-in-family,White,Male,0,0,45,United-States
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5699,34,Private,93213,Masters,14,Married-civ-spouse,Other-service,Husband,White,Male,0,0,30,United-States
10742,31,Private,323985,HS-grad,9,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States
537,44,Private,98779,Some-college,10,Married-civ-spouse,Craft-repair,Husband,White,Male,5178,0,40,United-States
9412,43,Federal-gov,205675,Some-college,10,Married-civ-spouse,Tech-support,Husband,White,Male,0,0,75,United-States


In [85]:
X_test2[X_test2['education-num']==9]

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,Random Column,Random Column2,Random Column3
3590,61,2,205711,11,9,2,10,0,4,1,0,0,30,39,0.192479,0.484492,0.112364
15359,46,4,191821,11,9,0,12,1,4,0,0,0,40,39,0.320825,0.052617,0.058938
4428,28,4,115438,11,9,2,7,0,4,1,0,0,50,39,0.086703,0.652387,0.851190
5688,52,1,157454,11,9,4,7,1,4,0,0,0,40,39,0.137620,0.297187,0.018956
6480,45,4,39464,11,9,0,1,4,4,0,0,0,40,39,0.213604,0.497229,0.831280
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7566,41,0,188436,11,9,2,0,0,4,1,0,0,20,2,0.328379,0.772657,0.603399
11449,30,4,183017,11,9,0,7,3,4,1,0,0,40,39,0.114022,0.385435,0.298030
4307,64,4,186731,11,9,6,1,4,4,0,0,0,40,39,0.582440,0.461029,0.281165
10742,31,4,323985,11,9,2,4,5,4,0,0,0,40,39,0.975895,0.297631,0.440498


In [138]:
y_pred_prob2 = model2.predict_proba(X_test2[X_test2['education-num']==9])[:,1]
average_precision_score(df_all_hs['pred'], y_pred_prob2)

0.6802009010117471

In [None]:
# Define the CatBoost classifier with max iterations set to 20
model2 = CatBoostClassifier(iterations=20,cat_features=categorical_features, class_weights=[1, 4],random_seed=42)

# Fit the model with the training data
model2.fit(X_train_original, y_train)


In [151]:
# Define the CatBoost classifier with max iterations set to 20
model2 = CatBoostClassifier(iterations=20,cat_features=categorical_features, class_weights=[1, 4],random_seed=42)

# Fit the model with the training data
model2.fit(X_train_original, y_train)

y_pred_prob2 = model2.predict_proba(X_test_original)[:,1]
average_precision_score(y_test, y_pred_prob2)

Learning rate set to 0.5
0:	learn: 0.5196429	total: 26.5ms	remaining: 504ms
1:	learn: 0.4501998	total: 54.1ms	remaining: 486ms
2:	learn: 0.4178338	total: 78.8ms	remaining: 446ms
3:	learn: 0.3958506	total: 104ms	remaining: 416ms
4:	learn: 0.3826767	total: 130ms	remaining: 391ms
5:	learn: 0.3745313	total: 157ms	remaining: 367ms
6:	learn: 0.3691147	total: 183ms	remaining: 340ms
7:	learn: 0.3663223	total: 205ms	remaining: 307ms
8:	learn: 0.3632921	total: 231ms	remaining: 283ms
9:	learn: 0.3605925	total: 256ms	remaining: 256ms
10:	learn: 0.3581642	total: 280ms	remaining: 229ms
11:	learn: 0.3552436	total: 306ms	remaining: 204ms
12:	learn: 0.3524914	total: 330ms	remaining: 178ms
13:	learn: 0.3508855	total: 353ms	remaining: 151ms
14:	learn: 0.3489830	total: 377ms	remaining: 126ms
15:	learn: 0.3481430	total: 406ms	remaining: 102ms
16:	learn: 0.3470688	total: 432ms	remaining: 76.2ms
17:	learn: 0.3470631	total: 445ms	remaining: 49.5ms
18:	learn: 0.3455060	total: 470ms	remaining: 24.7ms
19:	learn:

0.7973357608856815

In [152]:
X_train_original

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States


In [153]:
X_train1

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country
0,0.301370,7,0.044302,9,0.800000,4,1,1,4,1,0.021740,0.0,0.397959,39
1,0.452055,6,0.048238,9,0.800000,2,4,0,4,1,0.000000,0.0,0.122449,39
2,0.287671,4,0.138113,11,0.533333,0,6,1,4,1,0.000000,0.0,0.397959,39
3,0.493151,4,0.151068,1,0.400000,2,6,0,2,1,0.000000,0.0,0.397959,39
4,0.150685,4,0.221488,9,0.800000,2,10,5,2,0,0.000000,0.0,0.397959,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,0.136986,4,0.166404,7,0.733333,2,13,5,4,0,0.000000,0.0,0.377551,39
32557,0.315068,4,0.096500,11,0.533333,2,7,0,4,1,0.000000,0.0,0.397959,39
32558,0.561644,4,0.094827,11,0.533333,6,1,4,4,0,0.000000,0.0,0.397959,39
32559,0.068493,4,0.128499,11,0.533333,4,1,3,4,1,0.000000,0.0,0.193878,39


# min max scaler decreased accuracy!

In [155]:
# Define the CatBoost classifier with max iterations set to 20
model2 = CatBoostClassifier(iterations=20,cat_features=categorical_features, class_weights=[1, 4],random_seed=42)

# Fit the model with the training data
model2.fit(X_train1, y_train)

y_pred_prob2 = model2.predict_proba(X_test1)[:,1]
average_precision_score(y_test, y_pred_prob2)

Learning rate set to 0.5
0:	learn: 0.5196429	total: 46.6ms	remaining: 886ms
1:	learn: 0.4501998	total: 91.9ms	remaining: 827ms
2:	learn: 0.4178338	total: 134ms	remaining: 758ms
3:	learn: 0.3958506	total: 178ms	remaining: 711ms
4:	learn: 0.3826767	total: 229ms	remaining: 686ms
5:	learn: 0.3745313	total: 273ms	remaining: 638ms
6:	learn: 0.3691147	total: 314ms	remaining: 583ms
7:	learn: 0.3663223	total: 344ms	remaining: 517ms
8:	learn: 0.3632921	total: 387ms	remaining: 473ms
9:	learn: 0.3605925	total: 431ms	remaining: 431ms
10:	learn: 0.3581642	total: 477ms	remaining: 390ms
11:	learn: 0.3552436	total: 519ms	remaining: 346ms
12:	learn: 0.3524914	total: 563ms	remaining: 303ms
13:	learn: 0.3508855	total: 605ms	remaining: 259ms
14:	learn: 0.3489830	total: 648ms	remaining: 216ms
15:	learn: 0.3481430	total: 694ms	remaining: 173ms
16:	learn: 0.3470688	total: 738ms	remaining: 130ms
17:	learn: 0.3470631	total: 763ms	remaining: 84.7ms
18:	learn: 0.3455060	total: 805ms	remaining: 42.4ms
19:	learn: 0

0.7973357608856815

In [144]:
threshold = 0.5  # Adjust the threshold as needed #catboost
y_pred = (y_pred_prob2 > threshold).astype(int)
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.77      0.85      1243
           1       0.55      0.88      0.67       385

    accuracy                           0.80      1628
   macro avg       0.75      0.83      0.76      1628
weighted avg       0.86      0.80      0.81      1628



In [145]:
y_pred_prob2 = model2.predict_proba(df_all_hs.drop('pred',axis=1))[:,1]
average_precision_score(df_all_hs['pred'], y_pred_prob2)

0.6536143340157508

In [146]:
threshold = 0.5  # Adjust the threshold as needed #catboost
y_pred = (y_pred_prob2 > threshold).astype(int)
from sklearn.metrics import classification_report

print(classification_report(df_all_hs['pred'], y_pred))

              precision    recall  f1-score   support

           0       0.95      0.81      0.88       438
           1       0.47      0.80      0.59        91

    accuracy                           0.81       529
   macro avg       0.71      0.81      0.73       529
weighted avg       0.87      0.81      0.83       529



In [81]:
cols = sel

In [83]:
cols

['age',
 'workclass',
 'fnlwgt',
 'education',
 'education-num',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'gender',
 'capital-gain',
 'capital-loss',
 'hours-per-week',
 'native-country']

In [84]:
feat_importance = [1,1,1,1,1,1,1,1,1,1,1,1,1,1]
len(feat_importance)

14

In [85]:
#import pandas as pd
from sklearn.neighbors import NearestNeighbors

# Assuming you have preprocessed training data 'X_train'
# Assuming you have test data 'X_test'
# Assuming you have feature weights in 'feature_weights'
# Assuming you have feature importances in 'feature_importances'

# Initialize the NearestNeighbors model
k = 5  # Number of nearest neighbors to consider

# Define a custom distance function based on your categorical features
def custom_distance(x1, x2):
    # Assuming x1 and x2 are instances with categorical features

    # Compute the distance based on your preference
    distance = 0
    for i in range(len(x1)):
        # Handle numerical features
        feat_distance = abs(x1[i] - x2[i]) 
        if feat_distance >= 1:
            distance += (1 * feat_importance[i])  # Euclidean distance or other distance metric
            #print(x1)
            #print(x2)
            #print(X_test1.columns[i],x1[i] ,x2[i],'distance: ',feat_distance)
        else:
            distance += (feat_distance * feat_importance[i])  # Euclidean distance or other distance metric
    #print(distance)
    return distance



nn_model = NearestNeighbors(n_neighbors=k, metric=custom_distance)

# Fit the model with the encoded training data
nn_model.fit(X_train1[cols],y_train)

# Find the distances and indices of the k nearest neighbors for each instance in the encoded test data
distances, indices = nn_model.kneighbors(X_test1[cols])

# Calculate the weighted similarity scores for each instance
weighted_similarity_scores = 1 - (distances.mean(axis=1) / distances.max()) #* feature_importances

# Normalize the weighted similarity scores to ensure they are between 0 and 1
normalized_similarity_scores = (weighted_similarity_scores - weighted_similarity_scores.min()) / (weighted_similarity_scores.max() - weighted_similarity_scores.min())

# Convert the normalized similarity scores to a DataFrame
similarity_scores_df = pd.DataFrame({'Similarity Score': normalized_similarity_scores})

similarity_scores_df


Unnamed: 0,Similarity Score
0,0.916435
1,0.696130
2,0.513902
3,0.844155
4,0.951078
...,...
1623,0.865897
1624,0.925567
1625,0.983710
1626,0.823145


In [89]:
from sklearn.neighbors import KNeighborsClassifier
knn_model = KNeighborsClassifier(n_neighbors=k, metric=custom_distance)

# Fit the model with the training data
knn_model.fit(X_train1, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30,
                     metric=<function custom_distance at 0x00000221ABD4C8B0>,
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [92]:
average_precision_score(y_test, y_pred_prob2[:,1])

0.5910511830920004

In [110]:
y_train[indices]

array([[0, 0, 0, 0, 1],
       [0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0],
       ...,
       [1, 1, 1, 0, 1],
       [0, 1, 1, 1, 1],
       [0, 0, 0, 1, 0]])

In [113]:
distances

array([[0.19371695, 0.23609339, 0.2893587 , 0.32632924, 0.35366405],
       [0.5035321 , 0.86769017, 1.19363126, 1.19885179, 1.21203447],
       [1.03059621, 1.44649565, 1.78564391, 1.82068924, 1.85074223],
       ...,
       [0.03723079, 0.057832  , 0.06500308, 0.07328144, 0.07361656],
       [0.48234797, 0.57752434, 0.58981932, 0.62723669, 0.63676081],
       [0.88460546, 0.89703697, 0.9341266 , 0.94990394, 0.97933171]])

In [114]:
weighted_similarity_scores

array([0.9211594 , 0.71962486, 0.55292211, ..., 0.98270307, 0.83581818,
       0.73826127])

In [126]:
normalized_distances = distances.min(axis=1, keepdims=True)  / distances
normalized_distances = normalized_distances  / normalized_distances.sum(axis=1, keepdims=True)

print(normalized_distances)

[[0.27537991 0.22595194 0.18435858 0.1634722  0.15083738]
 [0.35240827 0.20450719 0.14866306 0.14801569 0.1464058 ]
 [0.29303708 0.20878245 0.16912829 0.16587284 0.16317934]
 ...
 [0.30956678 0.19929133 0.17730571 0.15727607 0.15656011]
 [0.2392633  0.19983255 0.19566698 0.1839946  0.18124257]
 [0.20974647 0.20683972 0.19862712 0.19532804 0.18945866]]


In [157]:
weighted_prob = normalized_distances * y_train[indices]
weighted_prob.sum(axis=1)#, keepdims=True)

array([0.15083738, 0.20450719, 0.        , ..., 0.84272393, 0.7607367 ,
       0.19532804])

In [156]:
weighted_prob

array([[0.        , 0.        , 0.        , 0.        , 0.15083738],
       [0.        , 0.20450719, 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ],
       ...,
       [0.30956678, 0.19929133, 0.17730571, 0.        , 0.15656011],
       [0.        , 0.19983255, 0.19566698, 0.1839946 , 0.18124257],
       [0.        , 0.        , 0.        , 0.19532804, 0.        ]])

In [158]:
average_precision_score(y_test, weighted_prob.sum(axis=1))

0.6067570737749551

In [163]:
threshold = 0.2  # Adjust the threshold as needed #knn
y_pred = (weighted_prob.sum(axis=1) > threshold).astype(int)
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.75      0.83      1243
           1       0.50      0.78      0.61       385

    accuracy                           0.76      1628
   macro avg       0.71      0.77      0.72      1628
weighted avg       0.82      0.76      0.78      1628



In [77]:
threshold = 0.5  # Adjust the threshold as needed #catboost
y_pred = (y_pred_prob2 > threshold).astype(int)
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.74      0.84      1243
           1       0.52      0.89      0.66       385

    accuracy                           0.78      1628
   macro avg       0.74      0.82      0.75      1628
weighted avg       0.85      0.78      0.80      1628



In [164]:
sim_score=True
df_train = X_train_original[sel].copy()
df_test = X_test_original[sel].copy()
df_train['truth'] = y_train
df_test['truth'] = y_test
df_test['pred'] = weighted_prob.sum(axis=1)

base_score = average_precision_score(df_test['truth'], df_test['pred'])
total_class_test = df_test['truth'].sum()
total_class_train = df_train['truth'].sum()

train_len = len(df_train)
test_len = len(df_test)


if sim_score:
    df_test['S_Score'] = normalized_similarity_scores
    array_as_list = [list(row) for row in indices]
    df_test = df_test.assign(neigbour=array_as_list)
    df_test['neigbours_mean'] = df_test['neigbour'].apply(lambda x: df_train.iloc[x]['truth'].mean())


features= ['education']


# Define a custom aggregate function to calculate accuracy
def calculate_map(labels,df_len,base=0,total_class=1,mode='test'):
    
    percent_rows = round(100 * len(labels) / df_len,1)
    
    fails_per_group = round(100*labels['truth'].sum()/total_class,1)
    fails_mean = round(labels['truth'].mean(),4)
    if mode == 'test':
        ap = average_precision_score(labels['truth'], labels['pred'])
        change_base = round(100*(ap - base_score)/base_score,1)
        if sim_score:
            sscore_mean = round(labels['S_Score'].mean(),2)
            neigbours_mean = round(labels['neigbours_mean'].mean(),4)
            return pd.Series({'MAP': ap, '%change_MAP': change_base,'test_%Rows': percent_rows,\
                              'test_%class1_total':fails_per_group,'test_ratio_class1_group':fails_mean,\
                              'S_Score_mean': sscore_mean,'neigbours_mean':neigbours_mean }) 
        else:
            return pd.Series({'MAP': ap, '%change_MAP': change_base,'test_%Rows': percent_rows,\
                              'test_%class1_total':fails_per_group,'test_ratio_class1_group':fails_mean}) 
            
    else:
        
        return pd.Series({'train_%Rows': percent_rows,\
                          'train_%class1_total':fails_per_group,'train_ratio_class1_group':fails_mean}) 
        


# Group the data based on the desired feature and calculate accuracy for each group

additional_args_test = {'base': base_score, 'total_class' : total_class_test,'df_len': test_len}
additional_args_train = {'total_class' : total_class_train,'mode':'train','df_len': train_len}

grouped_data = df_test.groupby(features).apply(calculate_map, **additional_args_test)
grouped_data_train = df_train.groupby(features).apply(calculate_map, **additional_args_train)



# Convert the grouped data to a DataFrame
df_results_test = grouped_data.reset_index().sort_values('MAP')#.rename(columns={'grouping_feature': 'Grouping Feature'})
df_results_train = grouped_data_train.reset_index()
df_results_all = pd.merge(df_results_test,df_results_train,on=features,how='left')
df_results_all
df_results_all[df_results_all['test_%Rows']>1]

Unnamed: 0,education,MAP,%change_MAP,test_%Rows,test_%class1_total,test_ratio_class1_group,S_Score_mean,neigbours_mean,train_%Rows,train_%class1_total,train_ratio_class1_group
0,7th-8th,0.029412,-95.2,2.1,0.3,0.0294,0.72,0.0588,2.0,0.5,0.0619
1,5th-6th,0.052632,-91.3,1.2,0.3,0.0526,0.7,0.0211,1.0,0.2,0.048
2,9th,0.076923,-87.3,1.6,0.5,0.0769,0.7,0.0462,1.6,0.3,0.0525
4,10th,0.193694,-68.1,2.3,0.5,0.0541,0.73,0.0378,2.9,0.8,0.0665
5,11th,0.397837,-34.4,4.1,1.8,0.1045,0.79,0.0388,3.6,0.8,0.0511
6,Some-college,0.411569,-32.2,22.7,16.9,0.1757,0.86,0.1751,22.4,17.7,0.1902
7,HS-grad,0.411927,-32.1,32.5,23.6,0.172,0.87,0.158,32.3,21.4,0.1595
8,Assoc-voc,0.414805,-31.6,5.1,5.5,0.253,0.8,0.253,4.2,4.6,0.2612
10,Bachelors,0.713038,17.5,15.2,27.0,0.4211,0.83,0.4308,16.4,28.3,0.4148
11,Masters,0.762449,25.7,5.6,11.4,0.4835,0.79,0.5143,5.3,12.2,0.5566


In [99]:
df_test[df_test.education=='HS-grad']#.tail(20)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,truth,pred,S_Score,neigbour,neigbours_mean
3590,61,Local-gov,205711,HS-grad,9,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,30,United-States,1,0.2,0.696130,"[11518, 7851, 30502, 1206, 4125]",0.2
15359,46,Private,191821,HS-grad,9,Divorced,Sales,Not-in-family,White,Female,0,0,40,United-States,0,0.0,0.949753,"[18908, 11113, 22977, 17698, 14331]",0.0
4428,28,Private,115438,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,50,United-States,1,0.4,0.990838,"[14176, 29348, 8758, 10267, 5736]",0.4
5688,52,Federal-gov,157454,HS-grad,9,Never-married,Machine-op-inspct,Not-in-family,White,Female,0,0,40,United-States,0,0.0,0.610126,"[20185, 1750, 12603, 3487, 25686]",0.0
6480,45,Private,39464,HS-grad,9,Divorced,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,0,0.0,0.981629,"[19131, 27917, 14598, 26509, 24094]",0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7566,41,?,188436,HS-grad,9,Married-civ-spouse,?,Husband,White,Male,0,0,20,Canada,0,0.4,0.731013,"[11192, 20953, 3297, 16755, 31253]",0.4
11449,30,Private,183017,HS-grad,9,Divorced,Machine-op-inspct,Own-child,White,Male,0,0,40,United-States,0,0.0,0.944934,"[17615, 11607, 28927, 5445, 4016]",0.0
4307,64,Private,186731,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,0,0.0,0.926178,"[18940, 21568, 32558, 13133, 28726]",0.0
10742,31,Private,323985,HS-grad,9,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,1,0.6,0.925567,"[5725, 30730, 20488, 8423, 6686]",0.6


In [166]:
df_test['pred2']  =df_test['pred'].apply(lambda x: 1 if x > 0.5 else 0)

In [167]:
df_test[(df_test.education=='HS-grad')&(df_test.pred2!=df_test.truth)]#.tail(20)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,truth,pred,S_Score,neigbour,neigbours_mean,pred2
3590,61,Local-gov,205711,HS-grad,9,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,30,United-States,1,0.204507,0.696130,"[11518, 7851, 30502, 1206, 4125]",0.2,0
4428,28,Private,115438,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,50,United-States,1,0.355474,0.990838,"[14176, 29348, 8758, 10267, 5736]",0.4,0
8253,32,Private,153353,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,1,0.261744,0.995804,"[14114, 16615, 29202, 8646, 22491]",0.4,0
3388,59,Private,140363,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,42,United-States,1,0.327271,0.983765,"[32362, 21411, 452, 13803, 14198]",0.4,0
14625,51,Federal-gov,23698,HS-grad,9,Married-civ-spouse,Other-service,Husband,White,Male,0,0,40,United-States,1,0.320384,0.767209,"[4789, 7280, 25336, 3894, 6484]",0.6,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7955,41,Private,244522,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,7688,0,42,United-States,1,0.219554,0.964231,"[11266, 10604, 14336, 25477, 1233]",0.2,0
13192,35,Private,399601,HS-grad,9,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,1,0.382780,0.878483,"[16671, 3267, 22007, 4872, 26542]",0.4,0
2583,47,Self-emp-not-inc,185859,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,3103,0,60,United-States,1,0.262201,0.947662,"[22389, 23911, 21538, 31444, 30931]",0.2,0
7566,41,?,188436,HS-grad,9,Married-civ-spouse,?,Husband,White,Male,0,0,20,Canada,0,0.571176,0.731013,"[11192, 20953, 3297, 16755, 31253]",0.4,1


In [103]:
df_train

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,truth
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,0
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,1
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,0
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,0


In [168]:
df_train.iloc[[32362, 21411, 452, 13803, 14198]]

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,truth
32362,63,Private,226422,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,0
21411,63,Private,206052,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,0
452,60,Private,245062,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,1
13803,63,Private,176696,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,1
14198,61,Private,273803,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,0


In [133]:
df_test[df_test.education=='HS-grad'].tail(20)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,truth,pred,S_Score,neigbour,neigbours_mean
257,39,Private,281768,HS-grad,9,Never-married,Other-service,Not-in-family,Black,Female,0,0,40,United-States,0,0.219688,0.926003,"[5520, 1033, 27408, 21937, 28074]",0.0
13232,31,Private,114937,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,0,0.242416,0.999938,"[20486, 17090, 5776, 20468, 11912]",0.2
7240,30,Private,233433,HS-grad,9,Never-married,Sales,Own-child,White,Female,0,0,45,United-States,0,0.219688,0.941921,"[23158, 4062, 225, 9395, 28249]",0.0
7651,23,Private,132220,HS-grad,9,Married-civ-spouse,Handlers-cleaners,Husband,White,Male,0,0,40,United-States,0,0.230736,0.99773,"[19500, 9379, 8625, 26564, 26211]",0.0
14886,50,Private,39590,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,48,United-States,1,0.248904,0.984514,"[9809, 15437, 26423, 11775, 30165]",0.0
14537,40,Private,132222,HS-grad,9,Never-married,Other-service,Not-in-family,White,Male,0,0,40,United-States,0,0.219688,0.997643,"[9116, 2712, 19432, 18456, 8162]",0.0
11967,40,Private,77975,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,0,0.248904,0.999881,"[22135, 17063, 15313, 25625, 25996]",0.4
3516,34,Private,205072,HS-grad,9,Never-married,Sales,Not-in-family,White,Male,0,0,50,United-States,0,0.219688,0.977819,"[16724, 25734, 18734, 23242, 15611]",0.0
7955,41,Private,244522,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,7688,0,42,United-States,1,0.322048,0.971924,"[26889, 4914, 7506, 844, 14336]",0.6
11063,28,Private,110408,HS-grad,9,Married-civ-spouse,Sales,Husband,White,Male,0,0,45,United-States,0,0.230736,0.997554,"[13996, 19679, 5816, 4517, 10628]",0.0


### Explain predictions on your local machine

In [10]:
from interpret_community.common.constants import ShapValuesOutput, ModelTask
# 1. Using SHAP TabularExplainer
model_task = ModelTask.Classification
explainer = MimicExplainer(model, X_train_original, LGBMExplainableModel,
                           augment_data=True, max_num_of_augmentations=10,
                           features=features, classes=classes, model_task=model_task,
                           transformations=feat_pipe)

Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.
Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.


### Generate global explanations
Explain overall model predictions (global explanation)

In [11]:
# Passing in test dataset for evaluation examples - note it must be a representative sample of the original data
# X_train can be passed as well, but with more examples explanations will take longer although they may be more accurate
global_explanation = explainer.explain_global(X_test_original)

Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.
Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.


In [12]:
# Sorted SHAP values
print('ranked global importance values: {}'.format(global_explanation.get_ranked_global_values()))
# Corresponding feature names
print('ranked global importance names: {}'.format(global_explanation.get_ranked_global_names()))
# Feature ranks (based on original order of features)
print('global importance rank: {}'.format(global_explanation.global_importance_rank))

# Note: Do not run this cell if using PFIExplainer, it does not support per class explanations
# Per class feature names
print('ranked per class feature names: {}'.format(global_explanation.get_ranked_per_class_names()))
# Per class feature importance values
print('ranked per class feature values: {}'.format(global_explanation.get_ranked_per_class_values()))

ranked global importance values: [0.08851701600074025, 0.04423984342430703, 0.026865152824231475, 0.01771938463413675, 0.00964107575295292, 0.008636013016220096, 0.007873693349515506, 0.001759403431184331, 0.0005859671707447269, 0.0, 0.0, 0.0, 0.0, 0.0]
ranked global importance names: ['marital-status', 'education-num', 'capital-gain', 'age', 'hours-per-week', 'capital-loss', 'occupation', 'workclass', 'relationship', 'native-country', 'gender', 'race', 'education', 'fnlwgt']
global importance rank: [5, 4, 10, 0, 12, 11, 6, 1, 7, 13, 9, 8, 3, 2]
ranked per class feature names: [['marital-status', 'education-num', 'capital-gain', 'age', 'hours-per-week', 'capital-loss', 'occupation', 'workclass', 'relationship', 'native-country', 'gender', 'race', 'education', 'fnlwgt'], ['marital-status', 'education-num', 'capital-gain', 'age', 'hours-per-week', 'capital-loss', 'occupation', 'workclass', 'relationship', 'native-country', 'gender', 'race', 'education', 'fnlwgt']]
ranked per class featur

In [13]:
# Print out a dictionary that holds the sorted feature importance names and values
print('global importance rank: {}'.format(global_explanation.get_feature_importance_dict()))

global importance rank: {'marital-status': 0.08851701600074025, 'education-num': 0.04423984342430703, 'capital-gain': 0.026865152824231475, 'age': 0.01771938463413675, 'hours-per-week': 0.00964107575295292, 'capital-loss': 0.008636013016220096, 'occupation': 0.007873693349515506, 'workclass': 0.001759403431184331, 'relationship': 0.0005859671707447269, 'native-country': 0.0, 'gender': 0.0, 'race': 0.0, 'education': 0.0, 'fnlwgt': 0.0}


In [249]:
o = {'marital-status': 0.08851701600074025, 'education-num': 0.04423984342430703, 'capital-gain': 0.026865152824231475, 'age': 0.01771938463413675, 'hours-per-week': 0.00964107575295292, 'capital-loss': 0.008636013016220096, 'occupation': 0.007873693349515506, 'workclass': 0.001759403431184331, 'relationship': 0.0005859671707447269, 'native-country': 0.0, 'gender': 0.0, 'race': 0.0, 'education': 0.0, 'fnlwgt': 0.0}

In [256]:
o.keys()

dict_keys(['marital-status', 'education-num', 'capital-gain', 'age', 'hours-per-week', 'capital-loss', 'occupation', 'workclass', 'relationship', 'native-country', 'gender', 'race', 'education', 'fnlwgt'])

In [254]:
o.values()

dict_values([0.08851701600074025, 0.04423984342430703, 0.026865152824231475, 0.01771938463413675, 0.00964107575295292, 0.008636013016220096, 0.007873693349515506, 0.001759403431184331, 0.0005859671707447269, 0.0, 0.0, 0.0, 0.0, 0.0])

In [14]:
from sklearn.pipeline import Pipeline
dashboard_pipeline = Pipeline(steps=[('preprocess', feat_pipe), ('model', model)])

## Analyze
### Analyze model errors and explanations using Error Analysis dashboard

In [15]:
from raiwidgets import ErrorAnalysisDashboard

ImportError: cannot import name 'json' from 'itsdangerous' (D:\anaconda3\lib\site-packages\itsdangerous\__init__.py)

In [None]:
# Run error analysis on the full dataset with subsampled explanation data on 5k rows
# Note in this case we need to provide the true_y_dataset parameter matching the
# original full dataset
ErrorAnalysisDashboard(global_explanation, dashboard_pipeline, dataset=X_test_original_full,
                       true_y=y_test, categorical_features=categorical_features,
                       true_y_dataset=y_test_full)