In [195]:
import pandas as pd
import numpy as np
import scipy as sp
pd.options.display.max_rows = 400
pd.options.display.max_columns = 400
%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [196]:
from sklearn.model_selection import RandomizedSearchCV

In [197]:
from loadcreon import LoadCreon
from creonmetrics import pu_scorer, prior_squared_error_scorer_015, \
    brier_score_labeled_loss_scorer, f1_assumed_scorer, f1_labeled_scorer
from semisuperhelper import SemiSupervisedHelper

In [3]:
path = "C:\Data\\010317\membership14_final_0103.txt"
lc = LoadCreon(path)

## Let's stratify 80/20 train test holdout for future use

In [198]:
from sklearn.model_selection import train_test_split

In [199]:
X_train, X_test, y_train, y_test = train_test_split(lc.X, lc.y, test_size=0.2, random_state=771, stratify=lc.y)

## Let's just train on PN data for now

In [162]:
semi_helper = SemiSupervisedHelper(y_train.values)

In [141]:
X_train_pn, y_train_pn = semi_helper.pn(X_train.values)

## Quick Scaler and LASSO to test run-time

In [200]:
from sklearn.preprocessing import MaxAbsScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from pnuwrapper import PNUWrapper

In [201]:
estimators = [('scaler', MaxAbsScaler()),
              ('clf',PNUWrapper(base_estimator=LogisticRegression(penalty='l1', C=10), 
                                num_unlabeled=5819, threshold_set_pct=0.0143))]
pipe = Pipeline(estimators)

In [202]:
from sklearn.model_selection import cross_val_score, cross_val_predict
from creonmetrics import pu_scorer

In [166]:
scores = cross_val_score(pipe, X_train.values, y_train.values, cv=5, scoring=f1_assumed_scorer, n_jobs=4)

In [167]:
scores

array([ 0.14577657,  0.11467577,  0.11382114,  0.13728324,  0.13105413])

In [168]:
scores.mean(), scores.std()*2

(0.12852216818706297, 0.025119307560292757)

In [169]:
pipe.get_params()

{'clf': PNUWrapper(base_estimator=LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
           penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
           verbose=0, warm_start=False),
       num_unlabeled=5819, random_state=None, threshold_set_pct=0.0143),
 'clf__base_estimator': LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
           penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
           verbose=0, warm_start=False),
 'clf__base_estimator__C': 10,
 'clf__base_estimator__class_weight': None,
 'clf__base_estimator__dual': False,
 'clf__base_estimator__fit_intercept': True,
 'clf__base_estimator__intercept_scaling': 1,
 'clf__base_estimator__max_iter': 100,
 'clf__base_estimator__multi_class': 'ovr',
 'clf__base_estimator__n_jobs': 1,
 'clf__base_

In [215]:
param_search = {'clf__base_estimator__C': sp.stats.expon(scale=4), 
               'clf__num_unlabeled': sp.stats.randint(low=2000, high=10000)}

In [216]:
random_log_regress = RandomizedSearchCV(pipe, param_search, n_iter=20, scoring=pu_scorer, n_jobs=-1, cv=5)

In [205]:
double_cross_scores_log = cross_val_score(random_log_regress, X_train.values, y_train.values, cv=2, scoring=pu_scorer, n_jobs=-1)

In [206]:
double_cross_scores_log

array([ 0.72420223,  0.97423055])

In [217]:
random_log_regress.fit(X_train.values, y_train.values)

RandomizedSearchCV(cv=5, error_score='raise',
          estimator=Pipeline(steps=[('scaler', MaxAbsScaler(copy=True)), ('clf', PNUWrapper(base_estimator=LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
      num_unlabeled=5819, random_state=None, threshold_set_pct=0.0143))]),
          fit_params={}, iid=True, n_iter=20, n_jobs=-1,
          param_distributions={'clf__base_estimator__C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000000000D7EE2E8>, 'clf__num_unlabeled': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000000000D7DE828>},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score=True, scoring=make_scorer(pu_score),
          verbose=0)

In [218]:
random_log_regress.best_params_

{'clf__base_estimator__C': 2.1079407454966477, 'clf__num_unlabeled': 4477}

In [219]:
random_log_regress.best_score_

4.7306442628930467

In [220]:
random_log_regress.cv_results_

{'mean_fit_time': array([ 6.07199998,  6.84040003,  6.41475997,  6.23632002,  7.05251999,
         6.37104001,  6.62064009,  5.75971999,  6.77391996,  6.72996006,
         6.26087995,  5.98415995,  5.65988002,  6.68032007,  6.53327994,
         6.56148005,  7.20840001,  6.98012004,  6.51079998,  5.53500004]),
 'mean_score_time': array([ 0.74236007,  0.51299996,  0.45552001,  0.46487999,  0.48360004,
         0.47423992,  0.39312   ,  0.51792006,  0.63336   ,  0.62087994,
         0.48156004,  0.55536003,  0.38687997,  0.51791992,  0.65208006,
         0.49607992,  0.46176   ,  0.48379989,  0.50231996,  0.38171997]),
 'mean_test_score': array([ 3.37972592,  4.60649412,  4.2117194 ,  4.38025589,  3.76724907,
         4.60651766,  4.57080159,  4.73064426,  4.37933608,  4.46379385,
         2.71753929,  4.64297667,  3.61092503,  4.59440701,  2.23583535,
         2.49033771,  4.53421867,  4.56838823,  3.81706648,  3.61380793]),
 'mean_train_score': array([ 3.39202669,  5.06057872,  4.213406

In [221]:
best = random_log_regress.best_estimator_

In [222]:
y_predict = best.predict(X_test.values)

In [223]:
y_true = y_test.values

In [231]:
from creonmetrics import labeled_metric, pu_score
from sklearn.metrics import f1_score, brier_score_loss, auc, average_precision_score

In [229]:
(labeled_metric(y_true, y_predict, f1_score),
labeled_metric(y_true, y_predict, brier_score_loss),
labeled_metric(y_true, y_predict, auc, reorder=True),
labeled_metric(y_true, y_predict, average_precision_score))

(0.4473197781885398, 0.47235387045813587, 0.5, 0.87120391720255441)

In [232]:
pu_score(y_true, y_predict)

5.4708833487226842