In [1]:
import numpy as np
import pandas as pd
import scipy as sp
from scipy import stats
# Raise pandas' display limits to 400 rows and 400 columns.
pd.set_option("display.max_rows", 400)
pd.set_option("display.max_columns", 400)
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the autoreload extension, then select mode 2 so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MaxAbsScaler
from sklearn.svm import SVC

In [3]:
from loadcreon import LoadCreon
from pnuwrapper import PNUWrapper
from nestedcross import NestedCV
from jeffsearchcv import JeffRandomSearchCV
from frankenscorer import FrankenScorer, extract_scores_from_nested, extract_score_grid
from blagging import BlaggingClassifier

In [4]:
# Raw string so every backslash in the Windows path is literal: no reliance on
# unrecognised escapes such as "\D" / "\m", and no risk of "\0..." being read
# as an octal escape. The resulting path value is unchanged.
# NOTE(review): this is a machine-specific absolute path; consider moving it to
# a configurable data directory so the notebook can run elsewhere.
path = r"C:\Data\010317\membership14_final_0103.txt"
# Load the dataset; later cells read the features and labels from lc.X / lc.y.
lc = LoadCreon(path)

## Stratified 80/20 train/test holdout split for future use

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
# Hold out 20% of the data as a test set, stratified on the labels;
# the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    lc.X,
    lc.y,
    test_size=0.2,
    random_state=771,
    stratify=lc.y,
)

## SVC pipeline and model on labeled data (PN)

In [7]:
# Two-step pipeline: scale features with MaxAbsScaler, then classify with a
# class-weight-balanced linear SVC (probability estimates enabled) wrapped
# in PNUWrapper.
estimators = [
    ('scaler', MaxAbsScaler()),
    (
        'clf',
        PNUWrapper(
            base_estimator=SVC(
                C=1.0,
                kernel='linear',
                probability=True,
                class_weight='balanced',
            )
        ),
    ),
]
pipe = Pipeline(steps=estimators)

In [8]:
# Fit the scaler + PNU-wrapped SVC pipeline on the training split.
pipe.fit(X=X_train.values, y=y_train.values)

Pipeline(steps=[('scaler', MaxAbsScaler(copy=True)), ('clf', PNUWrapper(base_estimator=SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
      num_unlabeled=0.0, pu_learning=False, random_state=None,
      threshold_set_pct=None))])

In [9]:
# Score the fitted pipeline on the held-out test split with a default
# FrankenScorer; the cell's value is a (metrics dict, score) pair.
FrankenScorer()(
    pipe,
    X_test.values,
    y_test.values,
)

({'SCORE': 0.9903381642512078,
  'assumed_brier': 0.98987546182088737,
  'assumed_brier_neg': 0.99451180725273935,
  'assumed_f1': 0.009284840798949227,
  'assumed_f1beta10': 0.31928017394254343,
  'confusion_matrix_lab': array([[215,   0],
         [  8, 410]]),
  'confusion_matrix_un': array([[  234, 87488],
         [    8,   410]]),
  'labeled_acc': 0.9873617693522907,
  'labeled_avg_prec': 0.99674973733342409,
  'labeled_brier': 0.012335177411497278,
  'labeled_brier_neg': 0.0034870615501299967,
  'labeled_brier_pos': 0.016886241789951739,
  'labeled_f1': 0.9903381642512078,
  'labeled_prec': 1.0,
  'labeled_recall': 0.98086124401913877,
  'labeled_roc_auc': 0.99043062200956933,
  'pr_one_unlabeled': 0.99978287451289616,
  'pu_score': 0.96473759438047069},
 0.9903381642512078)

## Nested cross-validated random search for the pipeline with PNU wrapper

In [8]:
# Search space for the randomized hyper-parameter search over `pipe`.
# Keys use the `<step>__<param>` path into the pipeline ('clf' is the
# PNUWrapper step, 'base_estimator' its wrapped SVC).
# The distributions come from the explicitly imported `scipy.stats` module
# rather than `sp.stats`, which only resolves if something else has already
# imported the submodule.
# NOTE(review): gamma presumably has no effect when kernel='linear' is drawn —
# confirm against the SVC documentation.
param_search = {
    # Continuous: exponential distribution with scale 4.
    'clf__base_estimator__C': stats.expon(scale=4),
    # Discrete: uniform integers in [1000, 8000).
    'clf__num_unlabeled': stats.randint(low=1000, high=8000),
    'clf__base_estimator__class_weight': [None, 'balanced'],
    'clf__base_estimator__kernel': ['linear', 'rbf'],
    'clf__pu_learning': [True, False],
    'clf__base_estimator__gamma': ['auto', 0.1, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 5.0, 9.0],
}

In [9]:
# Inner randomized search: 20 sampled parameter settings, 3-fold CV,
# scored with FrankenScorer on 'assumed_f1beta10', using all cores.
jeffsearch = JeffRandomSearchCV(
    pipe,
    param_search,
    n_iter=20,
    scoring=FrankenScorer('assumed_f1beta10'),
    n_jobs=-1,
    cv=3,
    verbose=100,
)

In [10]:
# Outer 3-fold loop wrapped around the randomized search, scored with the
# same 'assumed_f1beta10' FrankenScorer metric.
nested_cross = NestedCV(
    jeffsearch,
    scoring=FrankenScorer('assumed_f1beta10'),
    cv=3,
)

In [None]:
# Run the nested cross-validation on the training split.
# NOTE(review): n_jobs=-1 is requested here and in the inner search as well —
# confirm the nested parallelism does not oversubscribe the machine.
nested_cross.score(
    X_train.values,
    y_train.values,
    n_jobs=-1,
    verbose=100,
)

Pickling array (shape=(624,), dtype=uint32).
Memmaping (shape=(352556, 288), dtype=int64) to new file C:\Users\JEFFRE~1.GOM\AppData\Local\Temp\joblib_memmaping_pool_2776_193460768\2776-201292096-d47a562ce4fdf84a63c4c4a1b92e0b10.pkl
Memmaping (shape=(352556,), dtype=int64) to new file C:\Users\JEFFRE~1.GOM\AppData\Local\Temp\joblib_memmaping_pool_2776_193460768\2776-201292096-37e9d47f2d553d0c8c1a28b850ef16f7.pkl
Memmaping (shape=(264416,), dtype=int32) to new file C:\Users\JEFFRE~1.GOM\AppData\Local\Temp\joblib_memmaping_pool_2776_193460768\2776-201292096-2fd06626ad60dca3852b81dee263d905.pkl
Pickling array (shape=(88140,), dtype=int32).
Pickling array (shape=(624,), dtype=uint32).
Memmaping (shape=(352556, 288), dtype=int64) to old file C:\Users\JEFFRE~1.GOM\AppData\Local\Temp\joblib_memmaping_pool_2776_193460768\2776-201292096-d47a562ce4fdf84a63c4c4a1b92e0b10.pkl
Memmaping (shape=(352556,), dtype=int64) to old file C:\Users\JEFFRE~1.GOM\AppData\Local\Temp\joblib_memmaping_pool_2776_193

In [None]:
# Average the per-fold test scores collected by the nested CV run.
(
    extract_scores_from_nested(nested_cross.test_score_datas_)
    .mean()
)

In [None]:
# Best parameter set chosen by each fitted search in nested_cross.estimators_.
[fitted_search.best_params_ for fitted_search in nested_cross.estimators_]