In [1]:
import dice_ml
from dice_ml.utils import helpers
import pandas as pd

In [15]:
adult_income = pd.read_csv('./data/adult.csv', na_values='?')
adult_income.head()


Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,,103497,Some-college,10,Never-married,,Own-child,White,Female,0,0,30,United-States,<=50K


In [16]:
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder

numerical_var = [col for col in adult_income.columns if adult_income[col].dtype != object]
categorical_var = [col for col in adult_income.columns if adult_income[col].dtype == object]
print(numerical_var)
print(categorical_var)

adult_income['income'] = adult_income['income'].apply(lambda x:x.replace("<=50K", "0"))
adult_income['income'] = adult_income['income'].apply(lambda x:x.replace(">50K", "1"))
adult_income['income'] = adult_income['income'].astype(int)



['age', 'fnlwgt', 'educational-num', 'capital-gain', 'capital-loss', 'hours-per-week']
['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country', 'income']


In [19]:
# age
for i in adult_income:
    adult_income.loc[(adult_income['age'] > 16) & (adult_income['age'] <= 25), 'age'] = 1
    adult_income.loc[(adult_income['age'] > 25) & (adult_income['age'] <= 32), 'age'] = 2
    adult_income.loc[(adult_income['age'] > 32) & (adult_income['age'] <= 40), 'age'] = 3
    adult_income.loc[(adult_income['age'] > 40) & (adult_income['age'] <= 50), 'age'] = 4
    adult_income.loc[adult_income['age'] > 50, 'age'] = 5

In [20]:
# education
adult_income['education'] = adult_income['education'].apply(lambda x: 'School' if x == '11th' or x == '7th-8th' or x == '10th' 
                                              or x == '5th-6th' or x == '9th' or x == '12th' or x == '1st-4th' 
                                              or x == 'Preschool' else x)
adult_income['education'] = adult_income['education'].apply(lambda x: 'Associate' if x == 'Assoc-acdm' or x == 'Assoc-voc' else x)
education_map = {'School':1,
             'HS-grad':2,
             'Some-college':3,
             'Bachelors':4,
             'Prof-school':5,
             'Associate':6,
             'Masters':7,
             'Doctorate':8}
adult_income['education'] = adult_income['education'].map(education_map)

In [22]:
# Converting ? to nan
adult_income[adult_income == '?'] = np.nan

In [28]:
col_with_symbol = ['workclass', 'occupation', 'native-country']
# Imputing ? with mode
for col in col_with_symbol:
    adult_income[col].fillna(adult_income[col].mode()[0], inplace=True)

In [29]:
y = adult_income['income']
X = adult_income.drop('income', axis = 1)

In [30]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE


X = adult_income.drop('income', axis=1)
y = adult_income['income']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)

category_var = [col for col in X.columns if X[col].dtypes == object]
category_var

['workclass',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'gender',
 'native-country']

In [31]:
numeric_var = [col for col in X.columns if X[col].dtypes != object]
numeric_var

['age',
 'fnlwgt',
 'education',
 'educational-num',
 'capital-gain',
 'capital-loss',
 'hours-per-week']

In [32]:
import category_encoders as ce
one_hot = ce.OneHotEncoder(cols = category_var, handle_unknown = 'ignore')

# Creating dataframe for categorical variables which converted to one hot encoded variables.
X_train_one_hot = pd.DataFrame(one_hot.fit_transform(X_train))
X_test_one_hot = pd.DataFrame(one_hot.transform(X_test))

X_train_one_hot.index = X_train.index
X_test_one_hot.index = X_test.index

num_X_train = X_train[numeric_var]
num_X_test = X_test[numeric_var]

# Joining numerical and one hot encoded variables to create our final X_train and X_test.
X_train_new = pd.concat([num_X_train, X_train_one_hot], axis = 1)
X_test_new = pd.concat([num_X_test, X_test_one_hot], axis = 1)

# Scaling our records into standard range of 0 and 1.
scaler = StandardScaler()

X_train_new = scaler.fit_transform(X_train_new)
X_test_new = scaler.transform(X_test_new)

In [33]:
# Logistic Regression
model_logr = LogisticRegression(random_state = 1)
model_logr.fit(X_train_new, y_train)
pred_logr = model_logr.predict(X_test_new)

In [36]:

data = dice_ml.Data(dataframe = adult_income,
    continuous_features=['age', 'hours-per-week'],
    outcome_name = 'income'
    )
m = dice_ml.Model(model=model_logr, backend="sklearn")

# model = dice_ml.Model(model_path = dice_ml.utils.helpers.get_adult_income_modelpath())

explanation = dice_ml.Dice(data, m)

In [47]:
adult_income.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,1,Private,226802,1,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,0
1,3,Private,89814,2,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,0
2,2,Local-gov,336951,6,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,1
3,4,Private,160323,3,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,1
4,1,Private,103497,3,10,Never-married,Prof-specialty,Own-child,White,Female,0,0,30,United-States,0


In [48]:
query_instance = {'age':22,
    'workclass':'Private',
    'marital-status':'Single',
    'occupation':'Service',
    'race': 'White',
    'gender':'Female',
    'hours-per-week': 45}

In [49]:
# Generate counterfactual examples
dice_explanation = explanation.generate_counterfactuals(query_instance, total_CFs=4, desired_class="opposite")
# Visualize counterfactual explanation
# dice_explanation.visualize_as_dataframe()

  0%|          | 0/7 [00:00<?, ?it/s]


AttributeError: 'str' object has no attribute 'columns'