In [60]:
import pandas as pd
import numpy as np

from sklearn.base import BaseEstimator, ClassifierMixin

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split

from sklearn.metrics import (roc_auc_score, accuracy_score, precision_score, 
    recall_score, log_loss, roc_curve)

In [2]:
class MultipleOutputModel(BaseEstimator,ClassifierMixin):
    def __init__(self,*model_list): # no *args or **kargs
        ''' 
            Args:
                *modellist: list of (name, pred_type, model) tuples

                "name" is full description, "pred_type" is 'isCalc' or 'isPII' (e.g.)
                
                The only real methods that need to be defined are:
                    __init__, fit, predict 
                optional but very useful: predict_proba
                
        '''

        self.names   = [nm[0] for nm in model_list]
        self.model_tag = [nm[1] for nm in model_list]
        self.models  = [nm[2] for nm in model_list]
        
    def __repr__(self):
        return self.__class__.__name__ + ': ' + str([(n,m) for n, m in zip(self.names,self.models)])
    
    def __getitem__(self, key):
        if isinstance(key,int):
            return self.models[key]
        elif key in self.model_tag:
            return self.models[self.model_tag.index(key)]
        else:
            raise KeyError(f'bad key: {key}')

    def __keys__(self):
        for tag in self.models:
            yield tag

    def __items__(self):
        for tag, model in zip(self.model_tag, self.models):
            yield tag, model

    def items(self):
        return [(tag, model) for tag, model in zip(self.model_tag, self.models)]

    @property
    def tags(self):
        return [pt for pt in self.model_tag]
    
    @property
    def _estimator_type(self):
        if all(getattr(estimator, "_estimator_type", None) == "classifier" 
                for estimator in self.models):
            return "classifier"
        elif all(getattr(estimator, "_estimator_type", None) == "regressor" 
                for estimator in self.models):
            return "regressor"
        else:
            return None

    def add_model(self,name_tag_model):
        self.names.append(name_tag_model[0])
        self.model_tag.append(name_tag_model[1])
        self.models.append(name_tag_model[2])
        
    def add_models(self,name_tag_model_list):
        self.names.extend([nm[0] for nm in name_tag_model_list])
        self.model_tag.extend([nm[1] for nm in name_tag_model_list])
        self.models.extend([nm[2] for nm in name_tag_model_list])
    
    def fit(self, X, ylist, **kwargs):
        if isinstance(X,dict):
            assert len(ylist) == len(X), 'input data set and labels mismatched sizes'
            if isinstance(ylist,list):
                ylist = {k: ylist[i] for i,k in enumerate(X.keys())}
            data_keys = set(X.keys())
            model_keys = set(self.model_tag)
            assert len(data_keys.symmetric_difference(model_keys)) == 0, (
                'data and models do not share the same tags')
            return {pt: model.fit(X[pt], ylist[pt],**kwargs) 
                    for pt, model in zip(self.model_tag,self.models)}
        else:
            assert len(ylist) == len(self.models), (
                'number of models mismatched to the number of label sets')
            return {pt: model.fit(X,y,**kwargs) 
                    for pt, model, y in zip(self.model_tag,self.models,ylist)}

    def predict_proba(self, X, with_tag=True, **kwargs):
        ''' predict_proba

            the 1 - prob(class=0) is used to identify the probability of the class of interest
        '''
        if with_tag:
            if isinstance(X,dict):
                data_keys = set(X.keys())
                model_keys = set(self.model_tag)
                assert len(data_keys.symmetric_difference(model_keys)) == 0, (
                    'data and models do not share the same tags')
                return {pt: 1-model.predict_proba(X[pt],**kwargs)[:,0] 
                        for pt, model in zip(self.model_tag,self.models)}
            else:
                return {pt: 1-model.predict_proba(X,**kwargs)[:,0] 
                        for pt, model in zip(self.model_tag,self.models)}
        else:
            if isinstance(X,list) and len(X) == len(self.models):
                return [1-model.predict_proba(Xsub,**kwargs)[:,0] for model, Xsub in zip(self.models, X)]
            else:
                return [1-model.predict_proba(X,**kwargs)[:,0] for model in self.models]

    def predict(self, X, with_tag=True, **kwargs):
        if with_tag:
            if isinstance(X,dict):
                data_keys = set(X.keys())
                model_keys = set(self.model_tag)
                assert len(data_keys.symmetric_difference(model_keys)) == 0, (
                    'data and models do not share the same tags')
                return {pt: model.predict(X[pt],**kwargs) 
                        for pt, model in zip(self.model_tag,self.models)}
            else:
                return {pt: model.predict(X,**kwargs) 
                        for pt, model in zip(self.model_tag,self.models)}
        else:
            if isinstance(X,list) and len(X) == len(self.models):
                return [model.predict(Xsub,**kwargs) for model, Xsub in zip(self.models, X)]
            else:
                return [model.predict(X,**kwargs) for model in self.models]

In [11]:
df = pd.read_csv('../../AnalyticsNLP/formsML/tests/test_data/form_data.train.csv')
df.columns

Index(['Section', 'Field', 'FieldText', 'isCalc', 'isPII', 'isCode',
       'isCheckbox', 'isMultipleChoice'],
      dtype='object')

In [58]:
rng = np.random.RandomState(seed=42)
labels = pd.DataFrame(dict(label=2*df['isPII'].values+df['isCalc'].values))
X_train, X_test, y_train, y_test = train_test_split(
            df[['FieldText']],
            labels,
            test_size=0.2,
            stratify=labels, 
            random_state=rng)

train_labels = [(y_train == lbl).values  for k, lbl in zip(['isPII', 'isCalc'],[1,2])]
test_labels = [(y_test == lbl).values  for k, lbl in zip(['isPII', 'isCalc'],[1,2])]
# pd.concat([X_train, y_train],axis=1,ignore_index=True).head(), pd.concat([X_test, y_test],axis=1,ignore_index=True).head()
X_train.head(), y_train.head(), X_test.head(), y_test.head()

(                                             FieldText
 119            What type of entity is this shareholder
 134  Combination of Kentucky Schedule K-1, 1-5,8 an...
 51                             Prior year's tax credit
 32                        LLET paid on original return
 88                              Alternative allocation,
      label
 119      0
 134      0
 51       0
 32       0
 88       0,
                                FieldText
 141                          Rents/Lease
 27   Certified rehabilitation tax credit
 29                     Extension payment
 97                        Total payrolls
 123              Kentucky gross receipts,
      label
 141      0
 27       0
 29       0
 97       0
 123      0)

In [56]:
names = ['PII classifier', 'calculated field classifier']
tag_list = ['pii', 'icCalc']
cols2select = ['FieldText']

# Fit the CountVectorizer to the training data
preproc = CountVectorizer().fit(df['FieldText'])


models = [Pipeline([
         ("selector", ColumnTransformer([("selector", "passthrough", cols2select)], remainder="drop")),
         ('ravel', FunctionTransformer(np.ravel, check_inverse=False)),
         ('preproc', preproc),
         ('model', LogisticRegression(C=20))
        ]) for n in names]

mm = MultipleOutputModel(*[(n,t,m) for n,t,m in zip(names, tag_list, models)])

In [59]:
mm.fit(X_train, train_labels)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


{'pii': Pipeline(steps=[('selector',
                  ColumnTransformer(transformers=[('selector', 'passthrough',
                                                   ['FieldText'])])),
                 ('ravel',
                  FunctionTransformer(check_inverse=False,
                                      func=<function ravel at 0x7f053c3251f0>)),
                 ('preproc', CountVectorizer()),
                 ('model', LogisticRegression(C=20))]),
 'icCalc': Pipeline(steps=[('selector',
                  ColumnTransformer(transformers=[('selector', 'passthrough',
                                                   ['FieldText'])])),
                 ('ravel',
                  FunctionTransformer(check_inverse=False,
                                      func=<function ravel at 0x7f053c3251f0>)),
                 ('preproc', CountVectorizer()),
                 ('model', LogisticRegression(C=20))])}

In [71]:
metrics = (roc_auc_score, accuracy_score, precision_score, 
    recall_score, log_loss, roc_curve)

pred_trn = mm.predict(X_train)

pred_tst = mm.predict(X_test)

met = pd.DataFrame()
for f in metrics:
    for (tag,prd), lbl in zip(pred_trn.items(), train_labels):
        met['_'.join(['train', tag, f.__name__])] = [f(prd,lbl)]
    for (tag,prd), lbl in zip(pred_tst.items(), test_labels):
        met['_'.join(['test', tag, f.__name__])] = [f(prd,lbl)]
met[list(reversed(sorted([m for m in met.columns if 'roc' not in m])))].transpose()

Unnamed: 0,0
train_pii_recall_score,1.0
train_pii_precision_score,1.0
train_pii_log_loss,9.992007e-16
train_pii_accuracy_score,1.0
train_icCalc_recall_score,1.0
train_icCalc_precision_score,1.0
train_icCalc_log_loss,9.992007e-16
train_icCalc_accuracy_score,1.0
test_pii_recall_score,0.5
test_pii_precision_score,0.375
