# SETUP

In [1]:
import numpy as np
import pandas as pd
import scipy as sp
from scipy import stats
# Let pandas print up to 400 rows and 400 columns before truncating, so the
# wide score tables further down are displayed in full.
pd.set_option("display.max_rows", 400)
pd.set_option("display.max_columns", 400)
# Render matplotlib figures inside the notebook output area.
%matplotlib inline
# Load the autoreload extension and use mode 2, which re-imports modules before
# each cell runs -- presumably so edits to the local `creon` package are picked
# up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MaxAbsScaler
from sklearn.svm import SVC

In [5]:
from creon.loadcreon import LoadCreon
from creon.creonsklearn.pnuwrapper import PNUWrapper
from creon.creonsklearn.nestedcross import NestedCV
from creon.creonsklearn.jeffsearchcv import JeffRandomSearchCV, extract_score_grid
from creon.creonsklearn.frankenscorer import FrankenScorer, extract_scores_from_nested

In [6]:
# Location of the membership extract on the local Windows drive.
# Written as a raw string because the backslashes are path separators, not
# escape sequences. The earlier literal only worked because "\D" and "\m" are
# unrecognised escapes that Python passes through unchanged -- a deprecated
# behaviour that newer interpreters warn about. The resulting path is identical.
path = r"C:\Data\010317\membership14_final_0103.txt"

# Load the data set; the loader exposes features and labels as `lc.X` / `lc.y`.
lc = LoadCreon(path)

## Stratified 80/20 train/test holdout split for future use

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
# Hold out 20% of the rows as a test set. The split is stratified on the labels
# so both partitions keep the same class proportions, and the fixed seed makes
# it repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    lc.X,
    lc.y,
    test_size=0.2,
    stratify=lc.y,
    random_state=771,
)

## SVC pipeline and model on labeled data (PN)

In [9]:
# Two-step pipeline: scale every feature by its maximum absolute value, then
# classify with a linear, class-balanced SVC (probability estimates enabled)
# wrapped in PNUWrapper with the wrapper's default settings.
estimators = [
    ('scaler', MaxAbsScaler()),
    ('clf', PNUWrapper(
        base_estimator=SVC(
            kernel='linear',
            C=1.0,
            class_weight='balanced',
            probability=True,
        ),
    )),
]
pipe = Pipeline(steps=estimators)

In [10]:
# Fit the baseline pipeline on the training partition, passing plain NumPy
# arrays rather than pandas objects. The fitted pipeline is the cell's output.
pipe.fit(
    X_train.values,
    y_train.values,
)

Pipeline(steps=[('scaler', MaxAbsScaler(copy=True)), ('clf', PNUWrapper(base_estimator=SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
      num_unlabeled=0.0, pu_learning=False, random_state=None,
      threshold_set_pct=None))])

In [11]:
# Score the fitted baseline on the held-out test partition with a default
# FrankenScorer. The cell output is a (dict of all metrics, headline score) tuple.
FrankenScorer()(
    pipe,
    X_test.values,
    y_test.values,
)

({'SCORE': 0.87792848335388396,
  'assumed_brier': 0.58442634604747512,
  'assumed_brier_neg': 0.58687331669745169,
  'assumed_f1': 0.010333217229768954,
  'assumed_f1beta10': 0.32602506211123805,
  'confusion_matrix_lab': array([[178,  37],
         [ 62, 356]]),
  'confusion_matrix_un': array([[19592, 68130],
         [   62,   356]]),
  'labeled_acc': 0.84360189573459721,
  'labeled_avg_prec': 0.92773667298543572,
  'labeled_brier': 0.10836180995979061,
  'labeled_brier_neg': 0.18119056936725547,
  'labeled_brier_pos': 0.070902041365041954,
  'labeled_f1': 0.87792848335388396,
  'labeled_prec': 0.90585241730279897,
  'labeled_recall': 0.85167464114832536,
  'labeled_roc_auc': 0.83979080894625568,
  'pr_one_unlabeled': 0.77814346280868962,
  'pu_mix_assumed_f1beta10': 33.536015571609504,
  'pu_score': 0.93350936048570299},
 0.87792848335388396)

## 3-Fold nested cross-validation with randomized search for the pipeline with PNU wrapper

In [12]:
# Search space sampled by the randomized search. Keys follow the Pipeline
# "<step>__<param>" convention: "clf" is the PNUWrapper step and
# "base_estimator" is the SVC it wraps.
#
# The distributions come from the explicitly imported `stats` module rather
# than `sp.stats`: a bare `import scipy as sp` does not guarantee that the
# `stats` submodule has been loaded, so the attribute lookup only worked when
# some other import had already pulled it in.
param_search = {
    # SVC regularisation strength: exponential distribution with mean 4.
    'clf__base_estimator__C': stats.expon(scale=4),
    # Integer drawn uniformly from [1000, 8000); presumably the number of
    # unlabeled samples the wrapper uses -- confirm against PNUWrapper.
    'clf__num_unlabeled': stats.randint(low=1000, high=8000),
    'clf__base_estimator__class_weight': [None, 'balanced'],
    'clf__base_estimator__kernel': ['linear', 'rbf'],
    'clf__pu_learning': [True, False],
    # Kernel coefficient; SVC ignores it when the sampled kernel is 'linear'.
    'clf__base_estimator__gamma': ['auto', 0.1, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 5.0, 9.0],
}

In [13]:
# Inner loop of the nested cross-validation: a randomized search that draws 20
# parameter settings from `param_search`, evaluates each with 3-fold CV on the
# 'assumed_f1beta10' metric, and runs on all available cores.
jeffsearch = JeffRandomSearchCV(
    pipe,
    param_search,
    n_iter=20,
    scoring=FrankenScorer('assumed_f1beta10'),
    cv=3,
    n_jobs=-1,
    verbose=1,
)

In [14]:
# Outer loop: 3-fold nested cross-validation around the randomized search,
# scored on the same 'assumed_f1beta10' metric, with a fixed seed.
nested_cross = NestedCV(
    jeffsearch,
    scoring=FrankenScorer('assumed_f1beta10'),
    cv=3,
    random_state=721,
)

In [15]:
# Run the nested cross-validation on the training partition only. The outer
# folds run sequentially (n_jobs=1) because the inner search already uses every
# core. Expect a long run: the logged output below shows about 384 minutes.
nested_cross.score(
    X_train.values,
    y_train.values,
    n_jobs=1,
    verbose=1,
)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 83.7min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed: 113.8min finished


Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 82.9min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed: 113.2min finished


Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 83.7min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed: 114.6min finished
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 383.8min finished


(0.53684978191660937, 0.51189951520493615, 0.52601275215742316)

In [16]:
# Mean of every metric across the outer test folds, listed alphabetically.
(
    extract_scores_from_nested(nested_cross.test_score_datas_)
    .mean()
    .sort_index()
)

assumed_brier                                                      0.0346932
assumed_brier_neg                                                  0.0328337
assumed_f1                                                         0.0439349
assumed_f1beta10                                                    0.524921
confusion_matrix_lab       [[175.666666667, 111.666666667], [181.33333333...
confusion_matrix_un        [[100776.333333, 16185.6666667], [181.33333333...
fn_confusion_matrix_lab                                              181.333
fn_confusion_matrix_un                                               181.333
fp_confusion_matrix_lab                                              111.667
fp_confusion_matrix_un                                               16185.7
labeled_acc                                                         0.652836
labeled_avg_prec                                                    0.830134
labeled_brier                                                       0.311251

In [17]:
# Standard deviation of every metric across the outer test folds, listed
# alphabetically, as a measure of how stable the scores are between folds.
(
    extract_scores_from_nested(nested_cross.test_score_datas_)
    .std()
    .sort_index()
)

assumed_brier                0.000430
assumed_brier_neg            0.000549
assumed_f1                   0.002591
assumed_f1beta10             0.012511
fn_confusion_matrix_lab      6.658328
fn_confusion_matrix_un       6.658328
fp_confusion_matrix_lab     17.009801
fp_confusion_matrix_un     842.380160
labeled_acc                  0.027273
labeled_avg_prec             0.016533
labeled_brier                0.019079
labeled_brier_neg            0.006566
labeled_brier_pos            0.025440
labeled_f1                   0.019015
labeled_prec                 0.029635
labeled_recall               0.011771
labeled_roc_auc              0.035075
pr_one_unlabeled             0.007110
pu_mix_assumed_f1beta10      1.474925
pu_score                     0.231366
tn_confusion_matrix_lab     17.502381
tn_confusion_matrix_un     843.311528
tp_confusion_matrix_lab      6.350853
tp_confusion_matrix_un       6.350853
dtype: float64

In [18]:
# Winning hyper-parameters chosen by the inner search in each outer fold.
[
    fold_search.best_params_
    for fold_search in nested_cross.estimators_
]

[{'clf__base_estimator__C': 7.1311952396509097,
  'clf__base_estimator__class_weight': 'balanced',
  'clf__base_estimator__gamma': 'auto',
  'clf__base_estimator__kernel': 'linear',
  'clf__num_unlabeled': 5117,
  'clf__pu_learning': True},
 {'clf__base_estimator__C': 7.1311952396509097,
  'clf__base_estimator__class_weight': 'balanced',
  'clf__base_estimator__gamma': 'auto',
  'clf__base_estimator__kernel': 'linear',
  'clf__num_unlabeled': 5117,
  'clf__pu_learning': True},
 {'clf__base_estimator__C': 7.1311952396509097,
  'clf__base_estimator__class_weight': 'balanced',
  'clf__base_estimator__gamma': 'auto',
  'clf__base_estimator__kernel': 'linear',
  'clf__num_unlabeled': 5117,
  'clf__pu_learning': True}]