# Setup, matplotlib inline, automatically reload libraries on every evaluation

In [1]:
import numpy as np
import pandas as pd
import scipy as sp
from scipy import stats  # explicit subpackage import; a bare `import scipy` does not guarantee `sp.stats` is loaded
# Let pandas render up to 400 rows and 400 columns before truncating a frame.
pd.options.display.max_rows = 400
pd.options.display.max_columns = 400
# Draw matplotlib figures inline in the notebook output.
%matplotlib inline
# Re-import edited modules automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

In [2]:
from loadcreon import LoadCreon
from creonmetrics import pu_scorer, prior_squared_error_scorer_015, brier_score_labeled_loss_scorer, \
    f1_assumed_scorer, f1_labeled_scorer, report_metrics, f1_assumed_beta10_scorer
from semisuperhelper import SemiSupervisedHelper
from pnuwrapper import PNUWrapper
from jeffsearchcv import JeffRandomSearchCV
from nestedcross import NestedCV
from frankenscorer import FrankenScorer, extract_scores_from_nested, extract_score_grid
from searchrf import save_search, load_search
from repeatedsampling import RepeatedRandomSubSampler

In [3]:
# Raw string: every backslash in this Windows path is literal. The non-raw
# spelling only worked because "\D" and "\m" are unrecognised escape sequences,
# which newer Python versions warn about. The resulting value is unchanged.
path = r"C:\Data\010317\membership14_final_0103.txt"
# Build the loader from the file path; its X and y attributes are what the train/test split below consumes.
lc = LoadCreon(path)

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import clone

In [5]:
# Hold out 20% of the rows as a test set, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    lc.X,
    lc.y,
    test_size=0.2,
    random_state=771,
    stratify=lc.y,
)

# FIRST REPEATED SUB SAMPLER

## Set up randomized search parameters

In [6]:
# Hyper-parameter space handed to the randomized search.
# Keys carrying the `base_estimator__base_estimator__` prefix address the
# estimator nested two wrapper levels down; `base_estimator__sample_imbalance`
# addresses the first-level wrapper.
# The two distributions come from the explicitly imported `scipy.stats`
# (`from scipy import stats`) rather than `sp.stats`, because a bare
# `import scipy as sp` does not guarantee that the stats subpackage is loaded.
rf_param_search = {
    'base_estimator__base_estimator__bootstrap': [True, False],
    'base_estimator__base_estimator__class_weight': [None, 'balanced', 'balanced_subsample'],
    'base_estimator__base_estimator__criterion': ['gini', 'entropy'],
    'base_estimator__base_estimator__max_depth': [None] + list(range(2, 100)),
    # NOTE(review): integer choices run up to 99, which assumes the data has at
    # least that many feature columns - confirm against the loaded frame.
    'base_estimator__base_estimator__max_features': ['sqrt', 'log2', None] + list(range(5, 100)),
    'base_estimator__base_estimator__min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                                                         15, 20, 25, 30, 35, 40, 45, 50, 75, 100],
    # The integer 2 is an absolute count; the floats are fractions of the sample count.
    'base_estimator__base_estimator__min_samples_split': [2, 0.005, 0.01, 0.015, 0.02, 0.025, 0.03, 0.035, 0.04,
                                                          0.045, 0.05, 0.07, 0.09, 0.1, 0.12, 0.15, 0.17, 0.2, 0.25],
    # Integers drawn from [10, 500).
    'base_estimator__base_estimator__n_estimators': stats.randint(low=10, high=500),
    # Floats drawn from [0.1, 1.0] (loc=0.1, scale=0.9).
    'base_estimator__sample_imbalance': stats.uniform(loc=0.1, scale=0.9),
}

## Set up PNU Wrapper around a repeated-sub-sampled Random Forest, then JeffRandomSearchCV, then NestedCV

In [12]:
# Innermost learner: a random forest that requests all available cores (n_jobs=-1).
# NOTE(review): the randomized search built in the next cell also passes n_jobs=-1;
# confirm that parallelising at both levels is intended.
rf = RandomForestClassifier(n_jobs=-1)
# First wrapper level: the repeated random sub-sampler around the forest.
rep = RepeatedRandomSubSampler(base_estimator=rf, verbose=1)
# Outermost wrapper; this is the estimator the search tunes.
pnu = PNUWrapper(base_estimator=rep, num_unlabeled=1.0)

In [13]:
# Randomized search over rf_param_search: 20 sampled candidates, 3 folds each,
# ranked by the FrankenScorer's 'assumed_f1beta10' decision score.
jeffsearch = JeffRandomSearchCV(
    pnu,
    rf_param_search,
    n_iter=20,
    scoring=FrankenScorer(decision_score='assumed_f1beta10'),
    n_jobs=-1,
    cv=3,
    verbose=1,
    pre_dispatch=8,
)

#### 3x3 (x20) nested cross validation

In [14]:
# Outer 3-fold loop wrapped around the randomized search, scored with the same decision score.
# NOTE(review): random_state=None - presumably the outer folds differ from run to run;
# confirm, and consider a fixed seed if the results need to be reproducible.
nested_cross = NestedCV(jeffsearch, scoring=FrankenScorer(decision_score='assumed_f1beta10'), cv=3, random_state=None)

## Score the nested cross - 180 models!

In [None]:
# Run the nested cross-validation on the training split, passing NumPy arrays (.values) rather than pandas objects.
# This is the long-running step of the notebook (3 outer folds x 3 inner folds x 20 candidates).
scores = nested_cross.score(X_train.values, y=y_train.values, verbose=100, pre_dispatch=8)

[CV]  ................................................................
Fitting 3 folds for each of 20 candidates, totalling 60 fits


## The scores of the 3 folds of the outer loop

In [None]:
# Display the value returned by nested_cross.score.
scores

In [None]:
# Save the nested CV object to a .pkl file under ./res.
# NOTE(review): "repreated" in the file name looks like a typo for "repeated"; it is left
# unchanged here because this string is the path that actually gets written.
save_search(nested_cross, './res/nested_cross_repreated_rf_small_20170130.pkl')

In [None]:
# Average the extracted outer-loop test scores (presumably one entry per outer fold - confirm).
extract_scores_from_nested(nested_cross.test_score_datas_).mean()

# Let's dive in and see the parameters for one of the best models

In [223]:
# Score grid for the third fitted search (index 2).
# NOTE(review): `next_ness` is not defined in any cell shown in this notebook (only `nested_cross` is).
# The outputs below list parameters with a single `base_estimator__` prefix and a best index of 32,
# which does not match the 20-candidate search configured above - presumably it comes from a
# different, earlier search. Confirm and define or load it explicitly.
# NOTE(review): this rebinds `scores`, which previously held the result of nested_cross.score.
scores = extract_score_grid(next_ness.estimators_[2])

In [224]:
# Index label of the candidate with the highest mean test assumed_f1beta10.
scores.mean_assumed_f1beta10_test.idxmax()

32

In [230]:
# Look the winning candidate up from the score grid instead of hard-coding its index,
# so this cell stays correct if the search is re-run and a different candidate wins.
# The idxmax label is used to index cv_results_['params'] in the same way the
# per-fold comprehension further down does.
best_param_idx = scores.mean_assumed_f1beta10_test.idxmax()
best_params = next_ness.estimators_[2].cv_results_['params'][best_param_idx]
best_params

{'base_estimator__class_weight': 'balanced_subsample',
 'base_estimator__max_depth': 66,
 'base_estimator__max_features': None,
 'base_estimator__min_samples_leaf': 10,
 'base_estimator__min_samples_split': 0.005,
 'base_estimator__n_estimators': 363,
 'num_unlabeled': 12593,
 'pu_learning': True}

In [243]:
# One score grid per fitted search held in next_ness.estimators_.
all_scores = [extract_score_grid(est) for est in next_ness.estimators_]

In [246]:
# For every grid, the index label of the row with the highest mean test assumed_f1beta10.
best_clf_idx = [grid['mean_assumed_f1beta10_test'].idxmax() for grid in all_scores]

In [247]:
# Pair each fitted search with its best index and pull out the matching parameter dict.
[search.cv_results_['params'][idx] for search, idx in zip(next_ness.estimators_, best_clf_idx)]

[{'base_estimator__class_weight': 'balanced',
  'base_estimator__max_depth': 59,
  'base_estimator__max_features': 50,
  'base_estimator__min_samples_leaf': 4,
  'base_estimator__min_samples_split': 0.01,
  'base_estimator__n_estimators': 131,
  'num_unlabeled': 12517,
  'pu_learning': False},
 {'base_estimator__class_weight': 'balanced',
  'base_estimator__max_depth': 80,
  'base_estimator__max_features': None,
  'base_estimator__min_samples_leaf': 4,
  'base_estimator__min_samples_split': 0.2,
  'base_estimator__n_estimators': 294,
  'num_unlabeled': 1807,
  'pu_learning': True},
 {'base_estimator__class_weight': 'balanced_subsample',
  'base_estimator__max_depth': 66,
  'base_estimator__max_features': None,
  'base_estimator__min_samples_leaf': 10,
  'base_estimator__min_samples_split': 0.005,
  'base_estimator__n_estimators': 363,
  'num_unlabeled': 12593,
  'pu_learning': True},
 {'base_estimator__class_weight': 'balanced',
  'base_estimator__max_depth': 54,
  'base_estimator__max

## Below are that specific model's test-set scores in the inner loop.  These can't really be used in any comparison since they are biased, but they are interesting to see.  Notice the PU-score of 10.45, recall 0.8, precision 0.92, pr_one_unlabeled = 5.84%.

In [226]:
# Locate the best candidate's row by lookup rather than hard-coding position 32,
# then keep only the columns holding mean test-set metrics.
# get_loc turns the idxmax label into a row position so the selection still goes through iloc.
best_row_pos = scores.index.get_loc(scores.mean_assumed_f1beta10_test.idxmax())
mean_test_cols = [c for c in scores.columns if 'test' in c and 'mean' in c]
scores.iloc[best_row_pos][mean_test_cols]

mean_labeled_f1_test                  0.859097
mean_labeled_recall_test              0.806113
mean_labeled_acc_test                 0.825762
mean_fp_confusion_matrix_lab_test         18.8
mean_labeled_brier_test               0.143333
mean_fn_confusion_matrix_un_test          51.8
mean_tp_confusion_matrix_lab_test        215.4
mean_tp_confusion_matrix_un_test         215.4
mean_tn_confusion_matrix_lab_test        119.2
mean_confusion_matrix_lab_test             NaN
mean_assumed_f1_test                  0.114144
mean_labeled_brier_neg_test           0.138661
mean_confusion_matrix_un_test              NaN
mean_assumed_f1beta10_test            0.719595
mean_labeled_prec_test                0.920053
mean_tn_confusion_matrix_un_test       52847.4
mean_labeled_roc_auc_test              0.83494
mean_assumed_brier_test              0.0532553
mean_pu_score_test                     10.4511
mean_labeled_brier_pos_test           0.145761
mean_fn_confusion_matrix_lab_test         51.8
mean_assumed_

In [189]:
# Mean of the extracted outer-loop test scores for `next_ness`.
# NOTE(review): `next_ness` is not defined in any cell shown in this notebook - confirm where it comes from.
extract_scores_from_nested(next_ness.test_score_datas_).mean()

fn_confusion_matrix_lab                                  62.4
tn_confusion_matrix_un                                64991.6
labeled_roc_auc                                      0.796337
confusion_matrix_lab           [[134.4, 38.0], [62.4, 271.6]]
labeled_acc                                          0.801717
confusion_matrix_un        [[64991.6, 5185.6], [62.4, 271.6]]
labeled_brier_neg                                    0.179147
labeled_prec                                         0.880482
fn_confusion_matrix_un                                   62.4
assumed_brier                                       0.0756377
assumed_brier_neg                                   0.0753697
labeled_brier_pos                                    0.131953
labeled_recall                                       0.813174
assumed_f1                                          0.0938105
fp_confusion_matrix_lab                                    38
assumed_f1beta10                                     0.705947
tp_confu

## Let's see what feature importance looks like for this specific estimator

In [235]:
# Clone the estimator held by the third search and configure the copy with the best parameters found above.
best_clf = clone(next_ness.estimators_[2].estimator).set_params(**best_params)

In [236]:
# Fit the configured estimator on the full training split (NumPy arrays).
best_clf.fit(X_train.values, y_train.values)

PNUWrapper(base_estimator=RandomForestClassifier(bootstrap=True, class_weight='balanced_subsample',
            criterion='gini', max_depth=66, max_features=None,
            max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=10, min_samples_split=0.005,
            min_weight_fraction_leaf=0.0, n_estimators=363, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
      num_unlabeled=12593, pu_learning=True, random_state=None,
      threshold_set_pct=None)

In [237]:
# Score the fitted model on the held-out test split.
# NOTE(review): FrankenScorer() is built with default arguments here, whereas the search used
# decision_score='assumed_f1beta10'; in the output below SCORE equals labeled_f1 - confirm that
# this is the intended headline metric.
FrankenScorer()(best_clf, X_test.values, y_test.values)

({'SCORE': 0.84653465346534651,
  'assumed_brier': 0.057540639553026629,
  'assumed_brier_neg': 0.057233791695435653,
  'assumed_f1': 0.10958026273630246,
  'assumed_f1beta10': 0.72530656811691596,
  'confusion_matrix_lab': array([[167,  48],
         [ 76, 342]]),
  'confusion_matrix_un': array([[82240,  5482],
         [   76,   342]]),
  'labeled_acc': 0.80410742496050558,
  'labeled_avg_prec': 0.90758404312906693,
  'labeled_brier': 0.1442279224653652,
  'labeled_brier_neg': 0.18756734801773578,
  'labeled_brier_pos': 0.12193611267168178,
  'labeled_f1': 0.84653465346534651,
  'labeled_prec': 0.87692307692307692,
  'labeled_recall': 0.81818181818181823,
  'labeled_roc_auc': 0.79746300211416499,
  'pr_one_unlabeled': 0.062097889311712208,
  'pu_score': 10.130976977567888},
 0.84653465346534651)

In [238]:
# Feature importances of the wrapped base estimator, labelled by column name, largest first.
(
    pd.DataFrame(best_clf.base_estimator.feature_importances_, index=X_test.columns)
    .sort_values(by=0, ascending=False)
)

Unnamed: 0,0
DIAG_FLAG4_Sum,0.4955561
DIAG_FLAG5_Sum,0.2300625
ndc_cat58_Sum,0.03582271
DIAG_FLAG6_Sum,0.01878767
age,0.01127403
CPT_FLAG43_Sum,0.00725422
ndc_cat87_Sum,0.006721163
CPT_FLAG48_Sum,0.006645633
ndc_cat85_Sum,0.005333897
DIAG_FLAG41_Sum,0.00515101
