In [None]:
# default_exp core

# 00_Core

> This module contains helper functions for preparing the data

In [None]:
#hide
from nbdev.showdoc import *
from fastcore.test import test_eq

In [None]:
#export
from fastai.tabular import *

We'll use the `ADULT_SAMPLE` dataset for all of our examples. The goal with this dataset is to predict whether an individual makes more or less than \$50,000.

In [None]:
# fetch the `ADULT_SAMPLE` data with fastai's `untar_data`; the returned folder holds `adult.csv`
path = untar_data(URLs.ADULT_SAMPLE)
# raw, unprocessed rows; the tabular processing happens when the data is wrapped further down
df = pd.read_csv(path/'adult.csv')

Next let's name our variables

In [None]:
# target column
dep_var = 'salary'
# input columns, split into categorical and continuous features
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['age', 'fnlwgt', 'education-num']
# preprocessing steps handed to `TabularList.from_df` in the next cell
procs = [FillMissing, Normalize, Categorify]

And decide how we want to split our data and build our `DataBunch`!

In [None]:
# wrap the frame, hold out rows 800-999 as the validation set, label with `dep_var`, then bundle it all up
dls = (TabularList.from_df(df, path=path, cat_names=cat_names, cont_names=cont_names, procs=procs)
                           .split_by_idx(list(range(800,1000)))
                           .label_from_df(cols=dep_var)
                           .databunch())

Now let's train an initial model to use

In [None]:
# `layers=[200,100]` presumably sets the sizes of the two hidden layers (confirm against the fastai docs)
learn = tabular_learner(dls, layers=[200,100], metrics=accuracy)
# a single epoch with a learning rate of 1e-2 is enough for a demo model
learn.fit(1, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,0.371138,0.381718,0.835,00:03


# _prepare_data
> Prepares train and test data in the format `SHAP` expects for its input. You can pass in a `DataFrame` to use as test data. If nothing is passed in, it will default to a random sample of up to `n_samples` rows from your validation data

In [None]:
#export
def _dataframe_of_dl(dl, col_names):
    "Takes a dataloader and column names and returns all of its content converted to a single dataframe."
    # for all elements in dataloader, get cont and cat columns, converts them to numpy, concat them
    matrix = [np.concatenate([x[0].to('cpu').numpy(), x[1].to('cpu').numpy()], axis=1) for x,y in iter(dl)]
    # concats all the rows before converting the result into a dataframe
    df = pd.DataFrame(np.concatenate(matrix), columns=col_names)
    return df

In [None]:
#export
def _prepare_data(learn:Learner, test_data:DataFrame=None, n_samples:int=128):
  """Prepares train and test data for `SHAP`, pass in a learner with optional data

  - `learn`: learner whose `data` provides the column names and the train/valid dataloaders.
  - `test_data`: optional raw `DataFrame`. It is processed with the learner's categorical names,
    continuous names and procs, and is returned in full (`n_samples` is not applied to it).
  - `n_samples`: upper bound on the number of rows sampled from the validation set when
    `test_data` is None.

  Returns `(X_train, X_test)`, two `DataFrame`s whose columns are `learn.data.col_names`.
  Raises `ValueError` when there is neither `test_data` nor a validation dataloader.

  Side effect: when `test_data` is given, the learner's test dataloader is replaced while the
  data is converted and is left as None afterwards, whether or not the conversion succeeded.
  """
  col_names = learn.data.col_names
  if test_data is None:
    # we use the validation dataset as test set
    if learn.data.valid_dl is None:
      # ValueError is still caught by callers handling the previous bare `Exception`
      raise ValueError("Error: you tried to use Shap with neither valid dataset nor user defined test data. Please pass a dataframe to test_data")
    X_test = _dataframe_of_dl(learn.data.valid_dl, col_names)
    X_test = X_test.sample(min(n_samples, len(X_test)))
  else:
    # converts test_data dataframe to a processed tabular list
    test_items = TabularList.from_df(test_data, cat_names=learn.data.cat_names, cont_names=learn.data.cont_names, procs=learn.data.procs)
    # temporarily adds test_items as a test dl to be able to turn it into a properly formatted dataframe
    if learn.data.test_dl is not None: print("Warning: this function will erase the current test dataset!")
    learn.data.add_test(test_items)
    try:
      X_test = _dataframe_of_dl(learn.data.test_dl, col_names)
    finally:
      # never leave the temporary test dl attached to the learner, even if the conversion failed
      learn.data.test_dl = None
  X_train = _dataframe_of_dl(learn.data.train_dl, col_names)
  return X_train, X_test

First, an example with a `DataFrame`

In [None]:
# the first 100 raw rows are passed as user-supplied test data and come back in processed form
X_train, X_test = _prepare_data(learn, df.iloc[:100])
X_test.head()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,education-num_na,age,fnlwgt,education-num
0,5.0,8.0,3.0,0.0,6.0,5.0,1.0,0.763248,-0.838107,0.751083
1,5.0,13.0,1.0,5.0,2.0,5.0,1.0,0.396752,0.445849,1.533375
2,5.0,12.0,1.0,0.0,5.0,3.0,2.0,-0.043043,-0.886792,-0.031209
3,6.0,15.0,3.0,11.0,1.0,2.0,1.0,-0.043043,-0.728821,1.924521
4,7.0,6.0,3.0,9.0,6.0,3.0,2.0,0.250154,-1.018462,-0.031209


In [None]:
#hide
# user-supplied test data is not subsampled, so all 100 rows must come back
test_eq(len(X_test), 100)

We can see that we are now working with the transformed data! Let's try with no argument:

In [None]:
# without test data, X_test is a random sample of the validation set (hence the shuffled index below)
X_train, X_test = _prepare_data(learn)
X_test.head()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,education-num_na,age,fnlwgt,education-num
34,5.0,8.0,5.0,7.0,2.0,5.0,1.0,-1.215828,2.31285,0.751083
13,5.0,12.0,3.0,0.0,1.0,5.0,2.0,-0.702735,0.607545,-0.031209
33,3.0,13.0,1.0,0.0,5.0,5.0,1.0,0.396752,-0.518261,1.533375
78,5.0,7.0,6.0,7.0,3.0,5.0,2.0,-0.995931,1.168481,-0.031209
148,5.0,12.0,3.0,0.0,1.0,5.0,1.0,1.569538,0.524692,-0.422354


In [None]:
#hide
# the validation split holds 200 rows (indices 800-999), so the default `n_samples=128` is the limiting factor
test_eq(len(X_test), 128)

In [None]:
#hide
# Manual check, kept commented out: passing something that is not a `DataFrame` should fail
# X_train, X_test = _prepare_data(learn, 'test')

# _predict

Now we need to grab predictions based on what `SHAP` throws back. This is a basic function you can use to get your predictions. We can't include it in the library as we need access to your current `Learner`

In [None]:
#export
def _predict(learn:Learner, data:pd.DataFrame):
  "Predict function for some data on a fastai model"
  device = 'cuda' if torch.cuda.is_available() else 'cpu'
  model = learn.model.to(device)
  nb_cat_cols = len(learn.data.train_ds.x.cat_names)
  nb_cont_cols = len(learn.data.train_ds.x.cont_names)
  x_cat = torch.from_numpy(data[:, :nb_cat_cols]).to(device, torch.int64)
  x_cont = torch.from_numpy(data[:, -nb_cont_cols:]).to(device, torch.float32)
  with torch.no_grad():
    pred_probs = learn.model(x_cat, x_cont).cpu().numpy() # .detach().to('cpu').numpy()
  return pred_probs

`SHAP` will expect a numpy array for our data, so let's work with that and get some predictions!

In [None]:
# mimic what `SHAP` will send: a plain numpy array, here the first five processed rows
data = X_test.iloc[:5].to_numpy()
pred_probs = _predict(learn, data)

In [None]:
#hide
# one output row per input row; the second dimension is the size of the model output (2 here)
test_eq(pred_probs.shape, (5,2))

Let's take a look at those predictions

In [None]:
# (number of rows passed in, size of the model output)
pred_probs.shape

(5, 2)

In [None]:
# despite the name these are the raw model outputs: the values shown are partly negative, so they are not normalised probabilities
pred_probs

array([[ 1.645038, -1.627416],
       [ 0.423796, -0.481057],
       [ 0.863326, -0.838185],
       [ 3.159658, -2.877828],
       [ 0.202847, -0.236409]], dtype=float32)

And now we can do whatever we need to!