In [None]:
# default_exp tabular.core

# tabular.core
> This module contains helper functions used by the interpretation classes in Pavel's interpretation modules.

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#export
from fastai2.tabular.all import *

In [None]:
#export
import pickle
from bz2 import BZ2File
from pathlib import Path
from pickle import load, dump

import pandas as pd

In [None]:
#export
def _merge_tfms(means, stds):
    "merge mean and std to singular dictionary"
    names = means.keys()
    merged = {}
    for n in names:
        mean, std = means[n], stds[n]
        merged[n] = {'mean':mean, 'std':std}
    return merged

In [None]:
#export
def _get_procs(dls):
    """
    Extract the fitted tabular `procs` from `dls` as plain dictionaries.

    Returns a dict with the keys:
    - 'FillMissing': the `na_dict` and `add_col` attributes of `dls.procs.fill_missing`
    - 'Normalize':   `{column: {'mean':..., 'std':...}}` built from `dls.procs.normalize`
    - 'Categorize':  per-column `{label: index}` lookups built from `dls.procs.categorify.classes`
    - 'Inputs':      `{'cats': {index: name}, 'conts': {index: name}}`
    - 'Outputs':     `dls.categorize.vocab`, or `['regression']` when that lookup fails
    """
    fill_missing = dls.procs.fill_missing
    fm = {'na_dict':fill_missing.na_dict, 'add_col':fill_missing.add_col}

    normalize = dls.procs.normalize
    norm = _merge_tfms(normalize.means.to_dict(), normalize.stds.to_dict())

    cat_names = dls.cat_names
    cont_names = dls.cont_names

    # Index of each input column in the order produced by iterating `dls.dataset`
    # NOTE(review): presumably iterating `dls.dataset` yields column names - confirm
    name2idx = {name:n for n,name in enumerate(dls.dataset) if name in cat_names or name in cont_names}

    cat_idxs = {name2idx[name]:name for name in cat_names}
    cont_idxs = {name2idx[name]:name for name in cont_names}
    names = {'cats':cat_idxs, 'conts':cont_idxs}

    # Shallow copy: entries are replaced below, the original vocab objects are not mutated
    categorize = dls.procs.categorify.classes.copy()
    for col in categorize:
        # Turn the vocab sequence into a label -> index lookup
        lookup = {label: idx for idx, label in enumerate(categorize[col])}
        # Swap the '#na#' placeholder for a real NaN key mapped to index 0
        # (raises KeyError if a vocab has no '#na#' entry)
        lookup.pop('#na#')
        lookup[np.nan] = 0
        categorize[col] = lookup

    # Fall back to a regression marker when no class vocab is available.
    # `Exception` (not a bare except) so KeyboardInterrupt/SystemExit still propagate.
    try:
        classes = dls.categorize.vocab
    except Exception:
        classes = ['regression']
    return {'FillMissing':fm, 'Normalize':norm, 'Categorize':categorize, 'Inputs':names, 'Outputs':classes}

In [None]:
#export
@patch
def to_fastinference(x:TabularLearner, fname='export', path=Path('.')):
    """
    Export data for `fastinference_onnx` or `_pytorch` to use.

    Pickles the dictionary returned by `_get_procs(x.dls)` to `path/{fname}.pkl`.
    `path` may be a `Path` or a plain string.
    """
    procs = _get_procs(x.dls)
    with open(Path(path)/f'{fname}.pkl', 'wb') as handle:
        # Fixed: the module is `pickle` (the previous `picle` raised NameError on every call)
        pickle.dump(procs, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
from nbdev.showdoc import *

In [None]:
# Show the documentation for the `to_fastinference` method patched onto `TabularLearner`.
doc(TabularLearner.to_fastinference)

In [None]:
import pandas as pd
# Resolve the Adult sample through fastai's data cache instead of a machine-specific
# absolute path, so the notebook runs on any machine.
adult_path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(adult_path/'adult.csv')
cat_names = ['workclass', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['fnlwgt', 'education-num']
# Fixed seed so the train/valid split is identical on every Restart & Run All
splits = RandomSplitter(seed=42)(range_of(df))
procs = [Categorify, FillMissing, Normalize]
# Numeric target column
y_names = ['age']

to = TabularPandas(df, procs=procs, cat_names=cat_names, cont_names=cont_names,
                   y_names=y_names, splits=splits)
dls = to.dataloaders()

In [None]:
# Display a sample batch from the DataLoaders created in the previous cell.
dls.show_batch()

Unnamed: 0,workclass,marital-status,occupation,relationship,race,education-num_na,fnlwgt,education-num,age
0,State-gov,Married-civ-spouse,Other-service,Wife,Black,False,72505.9938,9.0,46.0
1,Private,Never-married,Transport-moving,Own-child,White,False,117778.997256,9.0,32.0
2,Private,Divorced,Adm-clerical,Not-in-family,White,False,137898.001144,9.0,28.0
3,Private,Never-married,Tech-support,Own-child,White,False,136984.999046,12.0,29.0
4,Self-emp-inc,Married-civ-spouse,Exec-managerial,Husband,White,False,31716.995023,9.0,28.0
5,Private,Never-married,Prof-specialty,Not-in-family,White,False,119410.997738,13.0,30.0
6,Self-emp-not-inc,Separated,Machine-op-inspct,Not-in-family,White,False,115214.997598,6.0,38.0
7,Private,Married-civ-spouse,Exec-managerial,Husband,White,False,211075.000053,9.0,74.0
8,Private,Married-civ-spouse,Tech-support,Husband,White,False,133336.001152,9.0,51.0
9,Private,Separated,Machine-op-inspct,Not-in-family,White,False,268357.998891,7.0,46.0


In [None]:
# Target column name(s) held by the DataLoaders.
dls.y_names

(#1) ['age']

In [None]:
# NOTE: with the numeric target configured earlier (y_names=['age']) the recorded
# output of this cell is `AttributeError: vocab`.
dls.vocab

AttributeError: vocab

In [None]:
# NOTE(review): the recorded output of this cell lists class labels, which does not match
# the y_names=['age'] setup earlier in the notebook - presumably captured from a
# different (classification) run; confirm and re-run from a fresh kernel.
dls.vocab

(#2) ['<50k','>=50k']

In [None]:
#export
class Interpret():
    """
    Master class that knows how to deal with a learner and a dataframe.
    Now for classification only.

    Relies on `learn.dls` (`test_dl`, `cat_names`, `cont_names`, `y_names`),
    `learn.get_preds` and `learn.loss_func`.
    """
    def __init__(self, learn, df):
        "Keep `learn` and the dataframe `df` that is used whenever no other dataframe is passed"
        self.learn = learn
        self.df = df

    def _predict_row(self, row):
        """
        Wrapper for prediction on a single row.
        Returns the first value of the first prediction as a Python float.
        """
        learn = self.learn
        return float(learn.get_preds(dl=learn.dls.test_dl(pd.DataFrame([row])))[0][0][0])

    def _predict_df(self, df=None, is_ret_actls=False):
        """
        Returns predictions for `df` (defaults to `self.df`) with the stored learner.
        With `is_ret_actls` set, returns a `(predictions, actuals)` tuple instead.
        Only the first column of each `get_preds` output is kept.
        """
        df = self.df if df is None else df
        # One `get_preds` call serves both return shapes
        out = self.learn.get_preds(dl=self.learn.dls.test_dl(df))
        preds = np.array(out[0].T[0])
        if not is_ret_actls:
            return preds
        return preds, np.array(out[1].T[0])

    def _convert_dep_col(self, dep_col, use_log=False):
        '''
        Converts the dataframe column `dep_col` (the dependent column) into a float32 tensor
        that can later be compared with predictions.
        Log will be applied if `use_log` is set.
        '''
        # Reshape the column values into a single-column 2D array
        actls = self.df[dep_col].T.to_numpy()[np.newaxis].T.astype('float32')
        if use_log:
            actls = np.log(actls)
        return torch.tensor(actls)

    def _list_to_key(self, field):
        """
        Turns an unhashable list of strings into a hashable key.
        A plain string is returned as is; other iterables are joined with ', '.
        """
        return f"{field}" if isinstance(field, str) else ', '.join(f"{e}" for e in field)

    def _sv_var(self, var, name, path: Path = None):
        "Save variable as pickle object to `path` (current directory when `None`) with `name`"
        target = Path('.' if path is None else path) / f"{name}.pkl"
        # Context manager closes the file even if pickling fails
        with open(target, "wb") as f:
            dump(var, f)

    def _ld_var(self, name, path: Path = None):
        "Returns a pickle object from `path` (current directory when `None`) with `name`"
        target = Path('.' if path is None else path) / f"{name}.pkl"
        # NOTE: unpickling can execute arbitrary code - only load files you trust
        with open(target, "rb") as f:
            return load(f)

    def _calc_loss(self, pred, targ):
        '''
        Calculates error from predictions and actuals with the learner's loss function.
        Both inputs are converted to tensors on `default_device()` first.
        '''
        func = self.learn.loss_func
        return func(torch.tensor(pred, device=default_device()), torch.tensor(targ, device=default_device()))

    def _calc_error(self, df=None):
        '''
        Wrapping function to calculate the error for a new dataframe on the existing learner.
        Predicts on `df` (defaults to `self.df`) and returns the loss as a Python float.
        '''
        df = self.df if df is None else df
        preds, actls = self._predict_df(df=df, is_ret_actls=True)
        return float(self._calc_loss(pred=preds, targ=actls))

    def _get_cat_columns(self, is_wo_na=False):
        "Categorical column names; with `is_wo_na` set, names ending in '_na' are dropped"
        cat_names = self.learn.dls.cat_names
        if not is_wo_na:
            return cat_names
        # Requires `cat_names` to provide a `.filter` method (a plain list does not)
        return cat_names.filter(lambda x: x[-3:] != "_na")

    def _get_cont_columns(self):
        "Continuous column names of the learner's DataLoaders"
        return self.learn.dls.cont_names

    def _get_all_columns(self):
        "Categorical column names followed by continuous column names"
        return self._get_cat_columns() + self._get_cont_columns()

    def _get_dep_var(self):
        "Name of the first dependent variable"
        return self.learn.dls.y_names[0]

In [None]:
#export
def sv_var(var, name, path, bzipped=False):
    """
    Save variable as pickle object to path with name.

    Writes `path/{name}.pkl`, or the bz2-compressed `path/{name}.pkl.bz2` when `bzipped` is set.
    `path` may be a `Path` or a plain string.
    """
    path = Path(path)
    if bzipped:
        opener, target = BZ2File, path/f"{name}.pkl.bz2"
    else:
        opener, target = open, path/f"{name}.pkl"
    # Context manager closes the file even if pickling fails
    with opener(target, "wb") as f:
        dump(var, f)

def ld_var(name, path, bzipped=False):
    """
    Returns a pickle object from path with name.

    Reads `path/{name}.pkl`, or the bz2-compressed `path/{name}.pkl.bz2` when `bzipped` is set.
    `path` may be a `Path` or a plain string.
    """
    path = Path(path)
    if bzipped:
        opener, target = BZ2File, path/f"{name}.pkl.bz2"
    else:
        opener, target = open, path/f"{name}.pkl"
    # NOTE: unpickling can execute arbitrary code - only load files you trust
    with opener(target, "rb") as f:
        return load(f)

In [None]:
#export
def _list_diff(list_1:list, list_2:list)->list:
    "Difference between first and second lists"
    diff = set(list_1) - set(list_2)
    return [item for item in list_1 if item in diff]

def list_diff(list1, list2, *args)->list:
    "Remove from `list1` everything found in `list2` or in any further list in `args`"
    remaining = list1
    # Subtract each list in turn, starting with `list2`
    for other in (list2, *args):
        remaining = _list_diff(remaining, other)
    return remaining

In [None]:
# Sample lists (mixed str and int elements) reused by the list-helper examples below.
list_1 = ["1", 2, 3, 4, "5", 77, -7]
list_2 = [3, "5"]
list_3 = [4, -7]
list_4 = ["bla-bla", 0, 77]

In [None]:
# list_1 without any element of list_2, list_3 or list_4; list_1's order is kept.
list_diff(list_1, list_2, list_3, list_4)

['1', 2]

In [None]:
# Only "1" and 2 appear in none of the subtracted lists.
test_eq(list_diff(list_1, list_2, list_3, list_4), ['1', 2])

In [None]:
#export
def which_elms(values:list, in_list:list)->list:
    '''
    Collects, in order, every element of values that is also contained in in_list
    '''
    found = []
    for candidate in values:
        if candidate in in_list:
            found.append(candidate)
    return found

In [None]:
# Elements of list_1 that also occur in list_2, in list_1's order.
which_elms(list_1, list_2)

[3, '5']

In [None]:
# Elements of list_1 that also occur in list_4.
which_elms(list_1, list_4)

[77]

In [None]:
# Pin the two examples above as tests.
test_eq(which_elms(list_1, list_2), [3, '5'])
test_eq(which_elms(list_1, list_4), [77])

In [None]:
#export
def is_in_list(values:list, in_list:list)->bool:
    '''
    Returns True if at least one element of values is contained in in_list, otherwise False.
    Stops at the first match instead of collecting every common element.
    '''
    return any(x in in_list for x in values)

In [None]:
# 77 is in both lists, so this is True.
is_in_list(list_1, ["bla-bla", 0, 77])

True

In [None]:
# No element of list_1 is in the second list, so this is False.
is_in_list(list_1, ["bla-bla", 0])

False

In [None]:
# Pin the two examples above as tests.
test_eq(is_in_list(list_1, ["bla-bla", 0, 77]), True)
test_eq(is_in_list(list_1, ["bla-bla", 0]), False)

In [None]:
#export
def listify(p=None, match=None):
    """
    Make `p` listy and the same length as `match`.

    - `None` becomes `[]`; a string or any object without a usable `len` is wrapped as `[p]`.
    - `match` may be an int (the target length), a sized object (its length is the target),
      or `None` (the target is the length of `p` itself).
    - A single-element `p` is repeated to the target length.

    Raises `AssertionError` when the final length differs from the target length.
    """
    if p is None: p = []
    elif isinstance(p, str): p = [p]
    else:
        # `Exception` rather than a bare except, so KeyboardInterrupt/SystemExit are not swallowed
        try: len(p)
        except Exception: p = [p]
    if type(match) is int: n = match
    elif match is None: n = len(p)
    else: n = len(match)
    if len(p)==1: p = p * n
    assert len(p)==n, f'List len mismatch ({len(p)} vs {n})'
    return list(p)

In [None]:
# None -> empty list; a list is returned unchanged; a scalar is repeated to the length of `match`.
test_eq(listify(None),[])
test_eq(listify([1,2,3]),[1,2,3])
test_eq(listify(1,match=[1,2,3]),[1,1,1])

In [None]:
#export
def isNone(cond) -> bool:
    "True only when `cond` is the `None` object itself (other falsy values give False)"
    return cond is None

def isNotNone(cond) -> bool:
    "True for every value except the `None` object itself (falsy values such as 0 or '' give True)"
    return cond is not None

In [None]:
# Only the None object counts; the string "None" and falsy values do not.
test_eq(isNone(None),True)
test_eq(isNone("None"),False)
test_eq(isNone(""),False)
test_eq(isNone(0),False)

In [None]:
# Mirror image of the isNone checks above.
test_eq(isNotNone(None),False)
test_eq(isNotNone("None"),True)
test_eq(isNotNone(""),True)
test_eq(isNotNone(0),True)