# SETUP

In [1]:
import numpy as np
import pandas as pd
import scipy as sp
from scipy import stats
pd.options.display.max_rows = 400
pd.options.display.max_columns = 400
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MaxAbsScaler
from sklearn.linear_model import LogisticRegression

In [3]:
from epiml.loadepiml import LoadEpiml, load_search, save_search
from epiml.epimlsklearn.pnuwrapper import PNUWrapper
from epiml.epimlsklearn.nestedcross import NestedCV
from epiml.epimlsklearn.jsearchcv import JRandomSearchCV, extract_score_grid
from epiml.epimlsklearn.frankenscorer import FrankenScorer, extract_scores_from_nested

In [4]:
# Location of the membership extract handed to the project loader.
# Raw string: the Windows backslashes must stay literal. The previous plain
# string only worked because "\D" and "\m" are not recognised escapes, which
# newer Python versions warn about.
# NOTE(review): this is a machine-specific absolute path; consider moving it
# to a configurable data directory.
path = r"C:\Data\membership14_final_0103.txt"
lc = LoadEpiml(path)

## Stratified 80/20 train/test split; the test set is held out for future use

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
# Keep 20% of the rows aside as a test set, stratifying on the label column
# and fixing the seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    lc.X,
    lc.y,
    test_size=0.2,
    stratify=lc.y,
    random_state=771,
)

## LASSO pipeline and model on labeled data (PN)

In [7]:
# Baseline pipeline: scale each feature by its maximum absolute value, then fit
# an L1-penalised (LASSO) logistic regression wrapped in the project's PNUWrapper.
# The solver is pinned to 'liblinear' (the solver shown in the recorded fit
# output) because newer scikit-learn releases default to 'lbfgs', which does
# not support the L1 penalty. liblinear also handles the 'l2' option that the
# later random search samples.
estimators = [('scaler', MaxAbsScaler()),
              ('clf', PNUWrapper(base_estimator=LogisticRegression(penalty='l1', C=2, solver='liblinear',
                                                                   random_state=732)))]
pipe = Pipeline(estimators)

In [8]:
# Fit the baseline pipeline on the training split, passing plain NumPy arrays.
pipe.fit(X=X_train.values, y=y_train.values)

Pipeline(steps=[('scaler', MaxAbsScaler(copy=True)), ('clf', PNUWrapper(base_estimator=LogisticRegression(C=2, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=732, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
      num_unlabeled=0.0, pu_learning=False, random_state=None,
      threshold_set_pct=None))])

#### Scores of LASSO PN Baseline model

In [9]:
# Score the fitted pipeline on the held-out test split with a default-constructed
# FrankenScorer; the recorded output is a (dict of named metrics, scalar score) pair.
FrankenScorer()(pipe, X_test.values, y_test.values)

({'SCORE': 0.89976689976689972,
  'assumed_brier': 0.60031267154696111,
  'assumed_brier_neg': 0.60285852450719479,
  'assumed_f1': 0.0094622917866817001,
  'assumed_f1beta10': 0.3170392537956721,
  'confusion_matrix_lab': array([[161,  54],
         [ 32, 386]]),
  'confusion_matrix_un': array([[ 6939, 80783],
         [   32,   386]]),
  'labeled_acc': 0.86413902053712477,
  'labeled_avg_prec': 0.92563531297005985,
  'labeled_brier': 0.1040448755848036,
  'labeled_brier_neg': 0.17793964147052785,
  'labeled_brier_pos': 0.066036802222529167,
  'labeled_f1': 0.89976689976689972,
  'labeled_prec': 0.87727272727272732,
  'labeled_recall': 0.92344497607655507,
  'labeled_roc_auc': 0.83614109268944037,
  'pr_one_unlabeled': 0.92254333938999167,
  'pu_mix_assumed_f1beta10': 32.629912394133711,
  'pu_score': 0.92598701456650112},
 0.89976689976689972)

#### Coefficients of features for LASSO PN Baseline model

In [10]:
# Build a feature-by-coefficient table (features as the index, coefficients in
# column 0) from the wrapped logistic regression, largest coefficient first.
imp = (
    pd.DataFrame(
        pipe.named_steps['clf'].base_estimator.coef_,
        columns=X_test.columns,
    )
    .T
    .sort_values(by=0, ascending=False)
)
# Show only the features whose coefficient is non-zero.
imp.loc[imp[0] != 0.0]

Unnamed: 0,0
DIAG_FLAG4_Sum,31.442457
ndc_cat54_Sum,7.552092
DIAG_FLAG69_Sum,5.673218
DIAG_FLAG5_Sum,5.406727
REVCODE_FLAG10_Sum,4.901706
REVCODE_FLAG19_Sum,4.670333
ndc_cat61_Sum,3.886519
DIAG_FLAG31_Sum,3.86993
DIAG_FLAG71_Sum,3.583528
ndc_cat58_Sum,3.45397


# Do 3-Fold Cross Validation of LASSO - PN

In [11]:
# Cross-validate the fixed (un-tuned) LASSO pipeline with the project's NestedCV
# helper: 3 folds, scored by FrankenScorer('assumed_f1beta10'), no fixed seed.
nested_lasso = NestedCV(
    pipe,
    cv=3,
    scoring=FrankenScorer('assumed_f1beta10'),
    random_state=None,
    use_same_random_state=False,
)
nested_lasso.score(
    X_train.values,
    y_train.values,
    verbose=1,
    n_jobs=-1,
)

[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   13.8s finished


WARN: NestedCV.best_params_ set to None
WARN: NestedCV.best_idxs_ set to None


(0.32460658700734701, 0.32100177981184846, 0.31133339831375795)

In [12]:
# Average every recorded metric across the outer folds, listed alphabetically.
(
    extract_scores_from_nested(nested_lasso.test_score_datas_)
    .mean()
    .sort_index()
)

assumed_brier                                                       0.593557
assumed_brier_neg                                                   0.596055
assumed_f1                                                        0.00951951
assumed_f1beta10                                                    0.318981
confusion_matrix_lab       [[193.333333333, 94.0], [39.3333333333, 517.33...
confusion_matrix_un        [[9331.0, 107631.0], [39.3333333333, 517.33333...
fn_confusion_matrix_lab                                              39.3333
fn_confusion_matrix_un                                               39.3333
fp_confusion_matrix_lab                                                   94
fp_confusion_matrix_un                                                107631
labeled_acc                                                         0.842003
labeled_avg_prec                                                    0.911138
labeled_brier                                                       0.116227

# 3-fold nested cross-validated random search for the pipeline with the PNU wrapper, which will _undersample_ the unlabeled data by a massive amount

In [13]:
# Search space for the randomized hyper-parameter search over the pipeline.
# Keys follow the sklearn `step__param` convention for nested parameters.
# `stats` is imported explicitly from scipy instead of relying on `sp.stats`
# having been loaded as a side effect of another import.
param_search = {
    # Regularisation strength C drawn from an exponential distribution with scale 4.
    'clf__base_estimator__C': stats.expon(scale=4),
    # Integer drawn uniformly from [1000, 10000) for the wrapper's `num_unlabeled`.
    'clf__num_unlabeled': stats.randint(low=1000, high=10000),
    'clf__base_estimator__class_weight': [None, 'balanced'],
    'clf__base_estimator__penalty': ['l1', 'l2'],
    'clf__pu_learning': [True, False],
}

In [14]:
# Randomized search over `param_search` using the project's JRandomSearchCV,
# scored by FrankenScorer('assumed_f1beta10') with a fixed seed.
# NOTE(review): n_iter=20 / cv=5 presumably mean 20 sampled candidates with
# 5-fold CV each, as in sklearn's RandomizedSearchCV; confirm against JRandomSearchCV.
jsearch = JRandomSearchCV(
    pipe,
    param_search,
    n_iter=20,
    cv=5,
    scoring=FrankenScorer('assumed_f1beta10'),
    random_state=42,
    n_jobs=-1,
    verbose=1,
)

In [15]:
# Wrap the random search in a 3-fold NestedCV, seeded and scored with the same
# FrankenScorer metric as the search itself.
nested_cross = NestedCV(
    jsearch,
    cv=3,
    scoring=FrankenScorer('assumed_f1beta10'),
    random_state=731,
)

In [16]:
# Run the nested search on the training split using all available cores.
# The recorded run took about 8.6 minutes.
nested_cross.score(
    X_train.values,
    y_train.values,
    verbose=1,
    n_jobs=-1,
)

[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  8.6min finished


(0.56329637821774114, 0.54711689168751887, 0.54173371291826544)

In [17]:
# Average every recorded metric across the outer folds, listed alphabetically.
(
    extract_scores_from_nested(nested_cross.test_score_datas_)
    .mean()
    .sort_index()
)

assumed_brier                                                       0.129326
assumed_brier_neg                                                   0.129123
assumed_f1                                                         0.0444195
assumed_f1beta10                                                    0.550716
confusion_matrix_lab       [[175.0, 112.333333333], [158.333333333, 398.3...
confusion_matrix_un        [[99981.0, 16981.0], [158.333333333, 398.33333...
fn_confusion_matrix_lab                                              158.333
fn_confusion_matrix_un                                               158.333
fp_confusion_matrix_lab                                              112.333
fp_confusion_matrix_un                                                 16981
labeled_acc                                                         0.679287
labeled_avg_prec                                                    0.841611
labeled_brier                                                       0.205563

In [18]:
# Standard deviation of every recorded metric across the outer folds, listed alphabetically.
(
    extract_scores_from_nested(nested_cross.test_score_datas_)
    .std()
    .sort_index()
)

assumed_brier                0.001913
assumed_brier_neg            0.001965
assumed_f1                   0.000336
assumed_f1beta10             0.011223
fn_confusion_matrix_lab     10.785793
fn_confusion_matrix_un      10.785793
fp_confusion_matrix_lab     11.718931
fp_confusion_matrix_un     522.054595
labeled_acc                  0.026653
labeled_avg_prec             0.014438
labeled_brier                0.013491
labeled_brier_neg            0.022241
labeled_brier_pos            0.009056
labeled_f1                   0.020772
labeled_prec                 0.022466
labeled_recall               0.019556
labeled_roc_auc              0.030194
pr_one_unlabeled             0.004574
pu_mix_assumed_f1beta10      1.209920
pu_score                     0.088974
tn_confusion_matrix_lab     12.288206
tn_confusion_matrix_un     521.230275
tp_confusion_matrix_lab     11.150486
tp_confusion_matrix_un      11.150486
dtype: float64

In [19]:
# Collect the winning hyper-parameters of each fitted search held by the nested CV.
[search.best_params_ for search in nested_cross.estimators_]

[{'clf__base_estimator__C': 3.9037130019710689,
  'clf__base_estimator__class_weight': 'balanced',
  'clf__base_estimator__penalty': 'l1',
  'clf__num_unlabeled': 2610,
  'clf__pu_learning': True},
 {'clf__base_estimator__C': 1.485616929764805,
  'clf__base_estimator__class_weight': 'balanced',
  'clf__base_estimator__penalty': 'l1',
  'clf__num_unlabeled': 9623,
  'clf__pu_learning': True},
 {'clf__base_estimator__C': 1.485616929764805,
  'clf__base_estimator__class_weight': 'balanced',
  'clf__base_estimator__penalty': 'l1',
  'clf__num_unlabeled': 9623,
  'clf__pu_learning': True}]