In [None]:
# default_exp core

# 00_Core

> This module contains helper functions for preparing the data

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#export
from fastai2.tabular.all import *

In [None]:
# Fetch the Adult sample dataset and read its CSV into a `DataFrame`
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path.joinpath('adult.csv'))

In [None]:
# Target column
dep_var = 'salary'
# Categorical feature columns
cat_names = [
    'workclass', 'education', 'marital-status',
    'occupation', 'relationship', 'race',
]
# Continuous feature columns
cont_names = ['age', 'fnlwgt', 'education-num']
# Preprocessing steps handed to `TabularPandas`
procs = [Categorify, FillMissing, Normalize]

In [None]:
# Rows 800-999 form the validation split
splits = IndexSplitter(list(range(800,1000)))(range_of(df))
# Reuse `dep_var` rather than repeating the 'salary' literal, so the target is defined in one place
to = TabularPandas(df, procs, cat_names, cont_names, y_names=dep_var, splits=splits)
dls = to.dataloaders()

First we train our initial model

In [None]:
# A single epoch is enough to obtain a fitted model for the helpers below
learn = tabular_learner(dls, layers=[200,100], metrics=accuracy)
learn.fit(n_epoch=1, lr=1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,0.374005,0.399195,0.825,00:08


Now we begin using shap

In [None]:
#export
import shap

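Before `SHAP` can explain the model, we need to prepare the data it will work on. `_prepare_data` returns one batch of training data together with the test data, both as `DataFrame`s. The test data can be passed in as a `DataFrame` or a `TabDataLoader`. If nothing is passed in, the validation dataset is used (down-sampled to at most 256 rows when it is larger).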

In [None]:
#export
def _prepare_data(learn:Learner, test_data=None):
  "Prepares train and test data for `SHAP`, pass in a learner with optional data"
  dtype = ''
  if isinstance(test_data, pd.DataFrame): dtype = 'pandas'
  elif isinstance(test_data, TabDataLoader): dtype = 'dl'
  elif test_data is None: dtype = 'train'
  else: raise ValueError('Input is not supported. Please use either a `DataFrame` or `TabularDataLoader`')
  cols = learn.dls.cat_names + learn.dls.cont_names
  X_train_cat, X_train_cont, _ = learn.dls.one_batch()
  X_train = [X_train_cat, X_train_cont]
  X_train = pd.DataFrame(np.concatenate([v.to('cpu').numpy() for v in X_train], axis=1), columns=cols)
  if dtype == 'pandas':
    dl = learn.dls.test_dl(test_data)
  elif dtype=='dl':
    dl = test_data
  else:
    dl = learn.dls[1]
    if len(dl) * learn.dls.bs > 256: 
      test_data = dl.dataset.all_cols.sample(256, replace=False)
      dl = learn.dls.test_dl(test_data)
  X_test = tensor(dl.cats).long(),tensor(dl.conts).float()
  X_test = pd.DataFrame(np.concatenate([v.to('cpu').numpy() for v in X_test], axis=1), columns=cols)
  return X_train, X_test

In [None]:
# A ready-made test `DataLoader` is accepted as-is
dl = learn.dls.test_dl(df.iloc[:100])
X_train, X_test = _prepare_data(learn, dl)
test_eq(X_train.shape[0], 64)
test_eq(X_test.shape[0], 100)

In [None]:
# A raw `DataFrame` is accepted too
X_train, X_test = _prepare_data(learn, df.head(100))
test_eq(X_train.shape[0], 64)
test_eq(X_test.shape[0], 100)

In [None]:
# With no test data, the 200 validation rows are used
X_train, X_test = _prepare_data(learn)
test_eq(X_train.shape[0], 64)
test_eq(X_test.shape[0], 200)

In [None]:
# Unsupported input must raise; asserting it with `test_fail` keeps the notebook runnable end to end
test_fail(lambda: _prepare_data(learn, 'test'), contains='Input is not supported')

ValueError: ignored

Next, `SHAP` needs a function that returns the model's predictions for the data it passes back. `_predict` is a basic function you can use for this. It takes the `Learner` as an explicit argument because it needs access to your current model.

In [None]:
#export
def _predict(learn:TabularLearner, data:np.array):
  "Predict function for some data on a fastai model"
  device = 'cuda' if torch.cuda.is_available() else 'cpu'
  model = learn.model.to(device)
  dl = learn.dls[0]
  nb_cat_cols = len(dl.dataset.cat_names)
  nb_cont_cols = len(dl.dataset.cont_names)
  x_cat = torch.from_numpy(data[:, :nb_cat_cols]).to(device, torch.int64)
  x_cont = torch.from_numpy(data[:, -nb_cont_cols:]).to(device, torch.float32)
  with torch.no_grad():
    pred_probs = learn.model(x_cat, x_cont).cpu().numpy()
  return pred_probs

In [None]:
# Five rows in, one two-value output row per input row
data = X_train.head(5).to_numpy()
pred_probs = _predict(learn, data)
test_eq(pred_probs.shape, (5,2))

And now we can do whatever we need to.