Install fastai-v2 library

In [0]:
!pip install torch torchvision feather-format kornia pyarrow Pillow wandb nbdev --upgrade 
!pip install git+https://github.com/fastai/fastprogress  --upgrade
!pip install git+https://github.com/fastai/fastai2

# restart the runtime for the new dependencies to be used.

In [0]:
from fastai2.torch_basics import *
from fastai2.basics import *
from fastai2.tabular.core import *

In [0]:
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
print(df.shape)
df.head()

(32561, 15)


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,49,Private,101320,Assoc-acdm,12.0,Married-civ-spouse,,Wife,White,Female,0,1902,40,United-States,>=50k
1,44,Private,236746,Masters,14.0,Divorced,Exec-managerial,Not-in-family,White,Male,10520,0,45,United-States,>=50k
2,38,Private,96185,HS-grad,,Divorced,,Unmarried,Black,Female,0,0,32,United-States,<50k
3,38,Self-emp-inc,112847,Prof-school,15.0,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,United-States,>=50k
4,42,Self-emp-not-inc,82297,7th-8th,,Married-civ-spouse,Other-service,Wife,Black,Female,0,0,50,United-States,<50k


In [0]:
# The categorical and continous variables are listed here
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['age', 'fnlwgt', 'education-num']

# Data transforms https://docs.fast.ai/tabular.transform.html
procs = [Categorify, FillMissing, Normalize]

# This is how to split the dataframe into train and valid (80/20)
splits = RandomSplitter()(range_of(df))
splits

((#26049) [32093,22445,21770,27363,23122,27286,17283,24158,6094,23409...],
 (#6512) [21654,30811,8445,18170,27843,7478,8646,24175,28172,16391...])

In [0]:
rang = range_of(df); 
print(rang[:10], rang[-10:])

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9] [32551, 32552, 32553, 32554, 32555, 32556, 32557, 32558, 32559, 32560]


In [0]:
# Now to create a TabularPandas object
# We pass in the dataframe, the preprocessor steps (procs), the categorical and continuous variables,
# the y variable, and how to split the data

to = TabularPandas(df, procs=procs, cat_names=cat_names, cont_names=cont_names, 
                   y_names="salary", splits=splits)

# Along with this there is an optional is_y_cat, which will determine if you want a regression problem or not.

In [0]:
# The new object is a bit like an enhanced pandas dataframe

to.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary,age_na,fnlwgt_na,education-num_na
7581,-0.555449,3,0.349145,10,1.13499,5,11,2,5,Male,0,0,45,United-States,0,1,1,1
13915,-0.335292,6,-0.115782,16,-0.037215,3,5,1,5,Male,0,0,65,United-States,1,1,1,1
1545,-1.215921,5,-0.417781,10,1.13499,5,13,4,5,Male,0,0,40,United-States,0,1,1,1
24521,-0.335292,3,0.399924,12,-0.42795,3,11,1,5,Male,0,0,40,El-Salvador,0,1,1,1
30400,-1.509464,5,-0.93163,12,-0.42795,5,7,3,5,Female,0,0,28,United-States,0,1,1,1



We can create our DataLoaders (a train and a valid). One great reason to do this this way is we can pass in different batch sizes into each TabDataLoader, along with changing options like shuffle and drop_last.

Our train and validation data live in tp.train and tp.valid right now, so we specify that along with our options. When you make a training DataLoader, you want shuffle to be True and drop_last to be *True*


In [0]:
# Dataloaders

trn_dl = TabDataLoader(to.train, bs=64, shuffle=True, drop_last=True)
val_dl = TabDataLoader(to.valid, bs=128)

# Since our validation dataset is much smaller, use a larger batch size.

In [0]:
# Create a DataBunch

dbunch = DataBunch(trn_dl, val_dl)
dbunch.show_batch()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,age_na,fnlwgt_na,education-num_na,age,fnlwgt,education-num,salary
0,Local-gov,HS-grad,Married-civ-spouse,Craft-repair,Husband,White,False,False,False,45.0,56840.997695,9.0,>=50k
1,Private,7th-8th,Divorced,Machine-op-inspct,Unmarried,White,False,False,False,60.000001,165440.999843,4.0,<50k
2,Private,HS-grad,Divorced,Transport-moving,Not-in-family,White,False,False,False,48.0,155861.998801,9.0,<50k
3,Private,9th,Married-civ-spouse,Handlers-cleaners,Husband,White,False,False,False,35.0,186489.000155,5.0,<50k
4,State-gov,Doctorate,Married-civ-spouse,Prof-specialty,Husband,White,False,False,False,53.000001,50047.99968,16.0,>=50k
5,Private,12th,Never-married,Prof-specialty,Own-child,White,False,False,False,17.0,241185.001223,8.0,<50k
6,State-gov,Some-college,Divorced,Adm-clerical,Unmarried,White,False,False,False,46.0,314769.999762,10.0,<50k
7,Private,Some-college,Never-married,Farming-fishing,Not-in-family,White,False,False,False,20.0,39476.997041,10.0,<50k
8,Local-gov,Bachelors,Married-civ-spouse,Protective-serv,Husband,White,False,False,False,46.0,200946.999633,13.0,>=50k
9,Private,HS-grad,Married-civ-spouse,Craft-repair,Husband,White,False,False,False,32.0,226882.999964,9.0,<50k


# Training

In [0]:
# Bring this in from fastai-v1
# https://github.com/fastai/fastai_dev/blob/master/dev/41_tabular_model.ipynb

def emb_sz_rule(n_cat): 
    "Rule of thumb to pick embedding size corresponding to `n_cat`"
    return min(600, round(1.6 * n_cat**0.56))

def _one_emb_sz(classes, n, sz_dict=None):
    "Pick an embedding size for `n` depending on `classes` if not given in `sz_dict`."
    sz_dict = ifnone(sz_dict, {})
    n_cat = len(classes[n])
    sz = sz_dict.get(n, int(emb_sz_rule(n_cat)))  # rule of thumb
    return n_cat,sz

def get_emb_sz(to, sz_dict=None):
    "Get default embedding size from `TabularPreprocessor` `proc` or the ones in `sz_dict`"
    return [_one_emb_sz(to.procs.classes, n, sz_dict) for n in to.cat_names]

class TabularModel(Module):
    "Basic model for tabular data."
    def __init__(self, emb_szs, n_cont, out_sz, layers, ps=None, embed_p=0., y_range=None, use_bn=True, bn_final=False):
        ps = ifnone(ps, [0]*len(layers))
        if not is_listy(ps): ps = [ps]*len(layers)
        self.embeds = nn.ModuleList([Embedding(ni, nf) for ni,nf in emb_szs])
        self.emb_drop = nn.Dropout(embed_p)
        self.bn_cont = nn.BatchNorm1d(n_cont)
        n_emb = sum(e.embedding_dim for e in self.embeds)
        self.n_emb,self.n_cont,self.y_range = n_emb,n_cont,y_range
        sizes = [n_emb + n_cont] + layers + [out_sz]
        actns = [nn.ReLU(inplace=True) for _ in range(len(sizes)-2)] + [None]
        _layers = [LinBnDrop(sizes[i], sizes[i+1], bn=use_bn and (i!=len(actns)-1 or bn_final), p=p, act=a)
                       for i,(p,a) in enumerate(zip(ps+[0.],actns))]
        self.layers = nn.Sequential(*_layers)
    
    def forward(self, x_cat, x_cont):
        if self.n_emb != 0:
            x = [e(x_cat[:,i]) for i,e in enumerate(self.embeds)]
            x = torch.cat(x, 1)
            x = self.emb_drop(x)
        if self.n_cont != 0:
            x_cont = self.bn_cont(x_cont)
            x = torch.cat([x, x_cont], 1) if self.n_emb != 0 else x_cont
        x = self.layers(x)
        if self.y_range is not None:
            x = (self.y_range[1]-self.y_range[0]) * torch.sigmoid(x) + self.y_range[0]
        return x


## Building the Model

Call TabularModel and pass in an embedding matrix size, how many continuous variables we have, our number of outputs, and our layer sizes.



In [0]:
# Embedding matrix by doing get_emb_sz and passing in a TabularPandas

emb_szs = get_emb_sz(to)
emb_szs

[(10, 6), (17, 8), (8, 5), (16, 8), (7, 5), (6, 4), (2, 2), (2, 2), (3, 3)]

In [0]:
# Print number of continous variables 

cont_len = len(to.cont_names); cont_len

3

In [0]:
# Use a simple [200, 100] model, out is of size 2 as this is binary classification (above or below $50k)

net = TabularModel(emb_szs, cont_len, 2, [200,100])

net

TabularModel(
  (embeds): ModuleList(
    (0): Embedding(10, 6)
    (1): Embedding(17, 8)
    (2): Embedding(8, 5)
    (3): Embedding(16, 8)
    (4): Embedding(7, 5)
    (5): Embedding(6, 4)
    (6): Embedding(2, 2)
    (7): Embedding(2, 2)
    (8): Embedding(3, 3)
  )
  (emb_drop): Dropout(p=0.0, inplace=False)
  (bn_cont): BatchNorm1d(3, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): Sequential(
    (0): LinBnDrop(
      (0): BatchNorm1d(46, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (1): Linear(in_features=46, out_features=200, bias=False)
      (2): ReLU(inplace=True)
    )
    (1): LinBnDrop(
      (0): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (1): Linear(in_features=200, out_features=100, bias=False)
      (2): ReLU(inplace=True)
    )
    (2): LinBnDrop(
      (0): Linear(in_features=100, out_features=2, bias=True)
    )
  )
)

In [0]:
# Creating an optimizer instance of our `Learner` object, and start training!

opt_func = partial(Adam, wd=0.01, eps=1e-5)
learn = Learner(dbunch, net, CrossEntropyLossFlat(), opt_func=opt_func, metrics=accuracy)

In [0]:
learn.fit(1)

(#5) [0,0.36167603731155396,0.3660365045070648,0.8318489193916321,00:09]


83.2% accuracy in one epoch

In [0]:
learn.fit(5)

(#5) [0,0.3605402708053589,0.35459697246551514,0.8419840335845947,00:09]
(#5) [1,0.34307861328125,0.3515145182609558,0.8409090638160706,00:09]
(#5) [2,0.34522944688796997,0.3478309214115143,0.8444410562515259,00:09]
(#5) [3,0.3538587689399719,0.3538019061088562,0.8363022208213806,00:09]
(#5) [4,0.35776010155677795,0.35092613101005554,0.8447481393814087,00:09]


In [0]:
# 24/11/2019 this function needs to be fixed in fastai library

@typedispatch
def show_results(x:Tabular, y:Tabular, samples, outs, ctxs=None, max_n=10, **kwargs):
    df = x.all_cols[:max_n]
    df[to.y_names+'_pred'] = y[to.y_names][:max_n].values
    display_df(df)

# learn.show_results()

# Permutation Importance

We train a full model and then shuffle the input columns and measure the change in accuracy afterwards. 

This will give us a tangeable viewing of how each variable in our tabular model affects the output (and how much it was needed)

In [0]:
class CalcPermutationImportance():
  def __init__(self, df:pd.DataFrame, learn:Learner, rounds:int, metric:callable):
    self.df, self.learn, self.rounds = df, learn, rounds
    self.learn.metrics = L(AvgMetric(metric))
    dbunch = learn.dbunch
    self.procs = dbunch.procs
    self.cats, self.conts = dbunch.cat_names, dbunch.cont_names
    self.cats = self.cats.filter(lambda x: '_na' not in x)
    self.y = dbunch.y_names
    self.results = self.calc_feat_importance()
    self.plot_importance(self.ord_dic_to_df(self.results))
    
  def calc_feat_importance(self):
    to_test = TabDataLoader(TabularPandas(self.df, self.procs, self.cats.copy(), self.conts, self.y))
    base_error = self.learn.validate(dl=to_test)[1]
    self.importance = {}
    pbar = master_bar(self.cats + self.conts, total=len(self.cats + self.conts))
    for col in pbar:
      self.importance[col] = self.calc_error(col)
      _ = progress_bar(range(1), display=False, parent=pbar)
      
    for key, value in self.importance.items():
        self.importance[key] = (base_error - value)/base_error
    return collections.OrderedDict(sorted(self.importance.items(), key=lambda kv: kv[1], reverse=True))
  
  def calc_error(self, sample_col:Str):
    df_temp = pd.concat([self.df]*self.rounds, ignore_index=True).copy()
    
    df_temp[sample_col] = np.random.permutation(df_temp[sample_col].values)
    to_test = TabDataLoader(TabularPandas(df_temp, self.procs, self.cats.copy(), self.conts, self.y))
    return self.learn.validate(dl=to_test)[1]
  
  def ord_dic_to_df(self, ord_dict:OrderedDict)->pd.DataFrame:
      return pd.DataFrame([[k, v] for k, v in ord_dict.items()], columns=['feature', 'importance'])
    
  def plot_importance(self, df:pd.DataFrame, limit=20, asc=False):
    df_copy = df.copy()
    df_copy['feature'] = df_copy['feature'].str.slice(0,25)
    ax = df_copy.sort_values(by='importance', ascending=asc)[:limit].sort_values(by='importance', ascending=not(asc)).plot.barh(x="feature", y="importance", sort_columns=True, figsize=(10, 10))
    for p in ax.patches:
        ax.annotate(f'{p.get_width():.4f}', ((p.get_width() * 1.005), p.get_y()  * 1.005))

In [0]:
# Take sample for feature importance (0.1 of dataset)

fi = df.sample(frac=0.1, random_state=1)
print(df.shape)

(32561, 15)


In [0]:
fi = df.iloc[29304:]

res = CalcPermutationImportance(fi, learn, 5, accuracy)

TypeError: ignored