In [0]:
from fastai.tabular import *

In [0]:
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')

In [0]:
dep_var = 'salary'
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['age', 'fnlwgt', 'education-num']
procs = [FillMissing, Categorify, Normalize]

## Splitting

In [0]:
from sklearn.model_selection import train_test_split

In [0]:
train, test = train_test_split(df, test_size=0.1)

In [0]:
data = (TabularList.from_df(train, path=path, cat_names=cat_names, 
                            cont_names=cont_names, procs=procs)
                           .split_by_rand_pct(0.2)
                           .label_from_df(cols=dep_var)
                           .databunch())

In [0]:
testing = (TabularList.from_df(test, path=path, cat_names=cat_names, 
                            cont_names=cont_names, procs=procs,
                              processor = data.processor)
                           .split_none()
                           .label_from_df(cols=dep_var)
                           .databunch())

## Train

In [0]:
learn = tabular_learner(data, layers=[200,100], metrics=accuracy)

In [9]:
learn.fit(1)

epoch,train_loss,valid_loss,accuracy,time
0,0.372707,0.353058,0.836007,00:04


## Feature Selection

One thing I had wrong before: Feature Importance should be calculated using the **test** set. As such, we have a new function:

In [0]:
from tqdm import tqdm

In [0]:
def myfeature_importance(learn:Learner, test:TabularDataBunch): 
    pd.options.mode.chained_assignment = None
    # based on: https://medium.com/@mp.music93/neural-networks-feature-importance-with-fastai-5c393cf65815
    data = learn.data.train_ds.x
    cat_names = data.cat_names
    cont_names = data.cont_names
    loss0=np.array([learn.loss_func(learn.pred_batch(batch=(x,y.to("cpu"))), y.to("cpu")) for x,y in iter(test.train_dl)]).mean()
    #The above gives us our ground truth for our validation set
    fi=dict()
    types=[cat_names, cont_names]
    with tqdm(total=len(data.col_names)) as pbar:
      for j, t in enumerate(types): # for all of cat_names and cont_names
        for i, c in enumerate(t):
          loss=[]
          for x,y in (iter(learn.data.valid_dl)): # for all values in validation set
            col=x[j][:,i] # select one column of tensors
            idx = torch.randperm(col.nelement()) # generate a random tensor
            x[j][:,i] = col.view(-1)[idx].view(col.size()) # replace the old tensor with a new one
            y=y.to('cpu')
            loss.append(learn.loss_func(learn.pred_batch(batch=(x,y)), y))
          pbar.update(1)
          fi[c]=np.array(loss).mean()-loss0 # average loss - overall loss. Higher number = greater loss?
    d = sorted(fi.items(), key=lambda kv: kv[1], reverse=True)
    
    df = pd.DataFrame({'Variable': [l for l, v in d], 'Importance': np.log1p([v for l, v in d])})
    df['Type'] = ''
    for x in range(len(df)):
      if df['Variable'].iloc[x] in cat_names:
        df['Type'].iloc[x] = 'categorical'
      if df['Variable'].iloc[x] in cont_names:
        df['Type'].iloc[x] = 'continuous'
    return df

In [16]:
myfeature_importance(learn, testing)

100%|██████████| 10/10 [00:06<00:00,  1.42it/s]


Unnamed: 0,Variable,Importance,Type
0,education-num,0.030869,continuous
1,marital-status,0.029844,categorical
2,age,0.018415,continuous
3,occupation,0.017222,categorical
4,relationship,0.013191,categorical
5,workclass,0.008279,categorical
6,education,0.006343,categorical
7,fnlwgt,0.005771,continuous
8,race,0.005666,categorical
9,education-num_na,0.005403,categorical


## Predictions

In [17]:
learn.get_preds()

[tensor([[0.9672, 0.0328],
         [0.9921, 0.0079],
         [0.6003, 0.3997],
         ...,
         [0.9840, 0.0160],
         [0.8426, 0.1574],
         [0.3330, 0.6670]]), tensor([0, 0, 0,  ..., 0, 0, 1])]

In [19]:
learn.get_preds(ds_type=DatasetType.Valid)

[tensor([[0.9672, 0.0328],
         [0.9921, 0.0079],
         [0.6003, 0.3997],
         ...,
         [0.9840, 0.0160],
         [0.8426, 0.1574],
         [0.3330, 0.6670]]), tensor([0, 0, 0,  ..., 0, 0, 1])]

In [21]:
learn.predict(df.iloc[0])

(Category >=50k, tensor(1), tensor([0.3669, 0.6331]))

In [22]:
learn.validate()

[0.35305777, tensor(0.8360)]

### On another labeled set revisited

In [23]:
learn.get_preds(testing.train_dl) ### WRONG

[tensor([[1.9956e-01, 8.0044e-01],
         [6.9816e-01, 3.0184e-01],
         [9.4956e-01, 5.0437e-02],
         ...,
         [5.1113e-01, 4.8887e-01],
         [9.9934e-01, 6.5816e-04],
         [6.5955e-01, 3.4045e-01]]), tensor([0, 1, 0,  ..., 1, 0, 1])]

In [0]:
learn.data.valid_dl = testing.train_dl

In [25]:
learn.validate()

[0.34789792, tensor(0.8425)]

In [26]:
learn.get_preds()[1]

tensor([0, 0, 0,  ..., 0, 0, 1])

Each of these go to a mapping found here

In [27]:
learn.data.c2i

{'<50k': 0, '>=50k': 1}