# Setup, matplotlib inline, automatically reload libraries on every evaluation

In [1]:
import numpy as np
import pandas as pd
import scipy as sp
from scipy import stats
# Let pandas show up to 400 rows and 400 columns before it truncates a displayed frame.
pd.set_option("display.max_rows", 400)
pd.set_option("display.max_columns", 400)
# Render matplotlib figures inline, directly below the cell that produces them.
%matplotlib inline
# Load the autoreload extension and switch it to mode 2, so that edited project
# modules are re-imported automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

In [8]:
from loadcreon import LoadCreon
from creonmetrics import pu_scorer, prior_squared_error_scorer_015, brier_score_labeled_loss_scorer, \
    f1_assumed_scorer, f1_labeled_scorer, report_metrics, f1_assumed_beta10_scorer
from semisuperhelper import SemiSupervisedHelper
from pnuwrapper import PNUWrapper
from jeffsearchcv import JeffRandomSearchCV
from nestedcross import NestedCV, rerun_nested_for_scoring
from frankenscorer import FrankenScorer, extract_scores_from_nested, extract_score_grid
from searchrf import save_search, load_search
from repeatedsampling import RepeatedRandomSubSampler

In [3]:
# Location of the membership extract on the author's Windows machine.
# Written as a raw string: the previous literal only produced the right value
# because "\D" and "\m" are *unrecognised* escapes that Python passes through,
# which is deprecated and emits a warning on newer interpreters. The resulting
# string is unchanged.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
path = r"C:\Data\010317\membership14_final_0103.txt"
# Build the project loader from `path`; later cells read lc.X and lc.y from it
# (used as the feature matrix and the stratification target in the train/test split).
lc = LoadCreon(path)

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import clone

In [5]:
# Hold out 20% of the rows as a test set. The split is stratified on lc.y and
# uses a fixed seed (771) so that it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    lc.X,
    lc.y,
    test_size=0.2,
    random_state=771,
    stratify=lc.y,
)

# FIRST REPEATED SUB SAMPLER

## Set up randomized search parameters

In [16]:
# Search space for the randomized hyper-parameter search.
#
# Key prefixes mirror the estimator nesting this notebook builds
# (PNUWrapper -> RepeatedRandomSubSampler -> RandomForestClassifier), using the
# scikit-learn "<component>__<parameter>" convention:
#   base_estimator__base_estimator__*  -> RandomForestClassifier parameters
#   base_estimator__sample_imbalance   -> RepeatedRandomSubSampler parameter
#   pu_learning                        -> PNUWrapper parameter
#
# Lists are sampled uniformly from their elements; scipy frozen distributions
# are sampled through their .rvs() method.
#
# The distributions come from the explicitly imported `scipy.stats` module
# (`from scipy import stats`). Reaching it as `sp.stats` after a bare
# `import scipy as sp` only works if some other import happened to load the
# submodule first.
rf_param_search = {
    # --- RandomForestClassifier ---
    'base_estimator__base_estimator__bootstrap': [True, False],
    'base_estimator__base_estimator__class_weight': [None, 'balanced', 'balanced_subsample'],
    'base_estimator__base_estimator__criterion': ['gini', 'entropy'],
    # None = unlimited depth, otherwise a cap between 2 and 99.
    'base_estimator__base_estimator__max_depth': [None] + list(range(2, 100)),
    # Named heuristics, all features (None), or an absolute count between 5 and 99.
    'base_estimator__base_estimator__max_features': ['sqrt', 'log2', None] + list(range(5, 100)),
    'base_estimator__base_estimator__min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 35,
                                                         40, 45, 50, 75, 100],
    # The integer 2 is an absolute sample count; the floats are fractions of the samples.
    'base_estimator__base_estimator__min_samples_split': [2, 0.005, 0.01, 0.015, 0.02, 0.025, 0.03, 0.035, 0.04,
                                                          0.045, 0.05, 0.07, 0.09, 0.1, 0.12, 0.15, 0.17, 0.2, 0.25],
    # Integers 10..299 (`high` is exclusive).
    'base_estimator__base_estimator__n_estimators': stats.randint(low=10, high=300),
    # --- RepeatedRandomSubSampler ---
    # Uniform on [loc, loc + scale] = [0.1, 0.767].
    'base_estimator__sample_imbalance': stats.uniform(loc=0.1, scale=0.667),
    # --- PNUWrapper ---
    'pu_learning': [True, False],
}

## Set up PNU Wrapper with Random Forest, then JeffRandomSearchCV, then NestedCV

In [17]:
# Estimator stack, innermost first:
#   rf  - random forest using all available cores (n_jobs=-1)
#   rep - RepeatedRandomSubSampler wrapping the forest
#   pnu - PNUWrapper wrapping the sub-sampler; this is what the search tunes
rf = RandomForestClassifier(n_jobs=-1)
rep = RepeatedRandomSubSampler(
    base_estimator=rf,
    verbose=1,
)
pnu = PNUWrapper(
    base_estimator=rep,
    num_unlabeled=1.0,
)

In [18]:
# Inner randomized search: 100 sampled parameter settings, each evaluated with
# 3-fold cross-validation, ranked by the 'pu_mix_assumed_f1beta10' score.
jeffsearch = JeffRandomSearchCV(
    pnu,
    rf_param_search,
    n_iter=100,
    scoring=FrankenScorer(decision_score='pu_mix_assumed_f1beta10'),
    n_jobs=-1,
    cv=3,
    verbose=1,
    pre_dispatch=8,
)

#### 3x3 (x100) nested cross validation

In [19]:
# Outer loop: 3 folds around the randomized search, scored with the same
# decision score as the inner search and seeded for repeatability.
nested_cross = NestedCV(
    jeffsearch,
    scoring=FrankenScorer(decision_score='pu_mix_assumed_f1beta10'),
    cv=3,
    random_state=74,
    use_same_random_state=True,
)

## Score the nested cross-validation - 900 model fits (3 outer folds x 100 candidates x 3 inner folds)

In [20]:
# Run the full nested cross-validation on the training split (as plain arrays).
# This is the expensive step: the recorded run below reports 4853.9 minutes in total.
scores = nested_cross.score(
    X_train.values,
    y=y_train.values,
    verbose=1,
)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 246.1min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 1092.4min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 1599.8min finished


generating 51 samples of indices to use to train multiple estimators,               sized 5718 elements with last being 4212 elements


[Parallel(n_jobs=1)]: Done  51 out of  51 | elapsed:  1.6min finished
[Parallel(n_jobs=1)]: Done  51 out of  51 | elapsed:  1.3min finished
[Parallel(n_jobs=1)]: Done  51 out of  51 | elapsed:  1.3min finished
[Parallel(n_jobs=1)]: Done  51 out of  51 | elapsed:  2.8min finished
[Parallel(n_jobs=1)]: Done  51 out of  51 | elapsed:  2.9min finished


Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 247.6min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 1101.8min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 1610.4min finished


generating 51 samples of indices to use to train multiple estimators,               sized 5718 elements with last being 4212 elements


[Parallel(n_jobs=1)]: Done  51 out of  51 | elapsed:  1.6min finished
[Parallel(n_jobs=1)]: Done  51 out of  51 | elapsed:  1.3min finished
[Parallel(n_jobs=1)]: Done  51 out of  51 | elapsed:  1.3min finished
[Parallel(n_jobs=1)]: Done  51 out of  51 | elapsed:  2.8min finished
[Parallel(n_jobs=1)]: Done  51 out of  51 | elapsed:  2.8min finished


Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 249.0min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 1104.5min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 1613.4min finished


generating 51 samples of indices to use to train multiple estimators,               sized 5723 elements with last being 4014 elements


[Parallel(n_jobs=1)]: Done  51 out of  51 | elapsed:  1.6min finished
[Parallel(n_jobs=1)]: Done  51 out of  51 | elapsed:  1.3min finished
[Parallel(n_jobs=1)]: Done  51 out of  51 | elapsed:  1.3min finished
[Parallel(n_jobs=1)]: Done  51 out of  51 | elapsed:  2.9min finished
[Parallel(n_jobs=1)]: Done  51 out of  51 | elapsed:  2.9min finished
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 4853.9min finished


## The scores of the 3 folds of the outer loop

In [21]:
# Echo the value returned by nested_cross.score() (the recorded output holds three values).
scores

(85.340037757051391, 81.295098816587654, 81.948790936809132)

In [22]:
# Persist the scored NestedCV object so the results can be inspected later
# without repeating the search.
# NOTE(review): the file name spells "repreated"; it is left as-is because the
# load cell reads the same path.
save_search(nested_cross, './res/nested_cross_repreated_rf_large_20170214.pkl')

In [6]:
# Restore the previously saved NestedCV object from disk.
# NOTE(review): the execution counter restarts at this cell, so it appears to
# have been run in a later kernel session than the cells above - confirm.
# NOTE(review): presumably a pickle (".pkl"); only load files from a trusted
# source - verify in searchrf.load_search.
nested_cross = load_search('./res/nested_cross_repreated_rf_large_20170214.pkl')

In [7]:
# Mean of every extracted test metric, listed alphabetically by metric name.
(
    extract_scores_from_nested(nested_cross.test_score_datas_)
    .mean()
    .sort_index()
)

assumed_brier                                                      0.0301626
assumed_brier_neg                                                  0.0292549
assumed_f1                                                          0.140726
assumed_f1beta10                                                    0.703419
confusion_matrix_lab         [[253.333333333, 34.0], [130.666666667, 426.0]]
confusion_matrix_un        [[111890.666667, 5071.33333333], [130.66666666...
fn_confusion_matrix_lab                                              130.667
fn_confusion_matrix_un                                               130.667
fp_confusion_matrix_lab                                                   34
fp_confusion_matrix_un                                               5071.33
labeled_acc                                                         0.804895
labeled_avg_prec                                                    0.923235
labeled_brier                                                       0.171904

# Let's dive in and see the parameters for one of the best models

In [24]:
# Winning parameter set of each fitted search held by the nested cross-validation.
[
    fitted_search.best_params_
    for fitted_search in nested_cross.estimators_
]

[{'base_estimator__base_estimator__bootstrap': True,
  'base_estimator__base_estimator__class_weight': None,
  'base_estimator__base_estimator__criterion': 'entropy',
  'base_estimator__base_estimator__max_depth': 93,
  'base_estimator__base_estimator__max_features': 49,
  'base_estimator__base_estimator__min_samples_leaf': 9,
  'base_estimator__base_estimator__min_samples_split': 0.005,
  'base_estimator__base_estimator__n_estimators': 216,
  'base_estimator__sample_imbalance': 0.24168487227538701,
  'pu_learning': True},
 {'base_estimator__base_estimator__bootstrap': True,
  'base_estimator__base_estimator__class_weight': None,
  'base_estimator__base_estimator__criterion': 'entropy',
  'base_estimator__base_estimator__max_depth': 93,
  'base_estimator__base_estimator__max_features': 49,
  'base_estimator__base_estimator__min_samples_leaf': 9,
  'base_estimator__base_estimator__min_samples_split': 0.005,
  'base_estimator__base_estimator__n_estimators': 216,
  'base_estimator__sample

In [9]:
# Score grid for the first fitted search (estimators_[0]).
# NOTE(review): presumably one row per sampled candidate with mean_*_test
# columns - confirm in frankenscorer.extract_score_grid.
score_grid = extract_score_grid(nested_cross.estimators_[0])

In [12]:
# Index label of the row with the highest mean_assumed_f1beta10_test value.
score_grid['mean_assumed_f1beta10_test'].idxmax()

18

In [13]:
# Pick the candidate with the best mean_assumed_f1beta10_test score and show
# its averaged test metrics.
#
# The row is looked up from the data instead of a hard-coded position copied
# from an earlier output, so the cell stays correct if the search is re-run.
# idxmax() returns an index *label*, hence .loc rather than .iloc.
test_clf = score_grid.loc[score_grid.mean_assumed_f1beta10_test.idxmax()]
# Keep only the fold-averaged test metrics (names containing both 'mean' and 'test').
cols = [col for col in test_clf.index if 'mean' in col and 'test' in col]
test_clf[cols].sort_index()

mean_assumed_brier_neg_test          0.0362089
mean_assumed_brier_test              0.0370337
mean_assumed_f1_test                  0.113537
mean_assumed_f1beta10_test             0.71061
mean_confusion_matrix_lab_test             NaN
mean_confusion_matrix_un_test              NaN
mean_fn_confusion_matrix_lab_test           76
mean_fn_confusion_matrix_un_test            76
mean_fp_confusion_matrix_lab_test           23
mean_fp_confusion_matrix_un_test          4534
mean_labeled_acc_test                 0.823949
mean_labeled_avg_prec_test            0.929029
mean_labeled_brier_neg_test          0.0787062
mean_labeled_brier_pos_test            0.21039
mean_labeled_brier_test               0.165592
mean_labeled_f1_test                  0.856289
mean_labeled_prec_test                0.927756
mean_labeled_recall_test              0.795148
mean_labeled_roc_auc_test             0.837487
mean_pr_one_unlabeled_test           0.0579947
mean_pu_mix_assumed_f1beta10_test      81.3285
mean_pu_score

## Let's see what fbeta10 looks like

In [25]:
# Stack the feature importances of every sub-estimator inside the best
# estimator of the first fitted search: one row per sub-estimator.
fi = np.array(
    [
        fitted_forest.feature_importances_
        for fitted_forest in nested_cross.estimators_[0].best_estimator_.base_estimator.estimators_
    ]
)

In [26]:
# Sanity check: rows are sub-estimators, columns are their feature importances.
fi.shape

(51, 287)

In [27]:
# Average the importances over all sub-estimators, label them with the feature
# (column) names and list the most important features first.
(
    pd.DataFrame(fi.mean(axis=0), index=X_test.columns)
    .sort_values(by=0, ascending=False)
)

Unnamed: 0,0
DIAG_FLAG4_Sum,0.2920396
DIAG_FLAG5_Sum,0.18051
ndc_cat58_Sum,0.04796033
CPT_FLAG9_Sum,0.02858056
ndc_cat87_Sum,0.02714204
DIAG_FLAG75_Sum,0.0258652
age,0.02282139
ndc_cat54_Sum,0.01475768
CPT_FLAG43_Sum,0.01449112
ndc_cat85_Sum,0.0103865
