In [1]:
import pandas as pd
import numpy as np
import scipy as sp
pd.options.display.max_rows = 400
pd.options.display.max_columns = 400
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [10]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MaxAbsScaler

In [3]:
from loadcreon import LoadCreon
from creonmetrics import pu_scorer, prior_squared_error_scorer_015, \
    brier_score_labeled_loss_scorer, f1_assumed_scorer, f1_labeled_scorer
from semisuperhelper import SemiSupervisedHelper

In [4]:
path = "C:\Data\\010317\membership14_final_0103.txt"
lc = LoadCreon(path)

## Let's stratify 80/20 train test holdout for future use

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
X_train, X_test, y_train, y_test = train_test_split(lc.X, lc.y, test_size=0.2, random_state=771, stratify=lc.y)

## Try clustering / PCA

In [9]:
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import PCA

In [42]:
estimators = [('scaler', MaxAbsScaler()), ('pca', PCA(n_components=10)), ('cluster', MiniBatchKMeans(verbose=100))]
pipe = Pipeline(estimators)

In [43]:
pipe.fit(X_train.values)

Init 1/3 with method: k-means++
Inertia for init 1/3: 13.390378
Init 2/3 with method: k-means++
Inertia for init 2/3: 13.072576
Init 3/3 with method: k-means++
Inertia for init 3/3: 14.353279
Minibatch iteration 1/352600: mean batch inertia: 0.041175, ewa inertia: 0.041175 
Minibatch iteration 2/352600: mean batch inertia: 0.055984, ewa inertia: 0.041184 
Minibatch iteration 3/352600: mean batch inertia: 0.041610, ewa inertia: 0.041184 
Minibatch iteration 4/352600: mean batch inertia: 0.044807, ewa inertia: 0.041186 
Minibatch iteration 5/352600: mean batch inertia: 0.046404, ewa inertia: 0.041189 
Minibatch iteration 6/352600: mean batch inertia: 0.053189, ewa inertia: 0.041196 
Minibatch iteration 7/352600: mean batch inertia: 0.040598, ewa inertia: 0.041195 
Minibatch iteration 8/352600: mean batch inertia: 0.038216, ewa inertia: 0.041194 
Minibatch iteration 9/352600: mean batch inertia: 0.037778, ewa inertia: 0.041192 
Minibatch iteration 10/352600: mean batch inertia: 0.039413, 

Pipeline(steps=[('scaler', MaxAbsScaler(copy=True)), ('pca', PCA(copy=True, iterated_power='auto', n_components=10, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('cluster', MiniBatchKMeans(batch_size=100, compute_labels=True, init='k-means++',
        init_size=None, max_iter=100, max_no_improvement=10, n_clusters=8,
        n_init=3, random_state=None, reassignment_ratio=0.01, tol=0.0,
        verbose=100))])

In [44]:
clusters_train = pipe.predict(X_train.values)

Computing label assignment and total inertia


In [46]:
np.bincount(clusters_train)

array([31615, 32941, 74976, 64922, 47669, 64209, 11659, 24565], dtype=int64)

In [49]:
pd.Series(clusters_train[y_train.values==0]).value_counts()

3    228
2    174
0    134
4     97
7     85
5     83
6     47
1     14
dtype: int64

## LASSO pipeline

In [50]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from pnuwrapper import PNUWrapper

In [59]:
estimators = [('scaler', MaxAbsScaler()),
              ('clf',PNUWrapper(base_estimator=LogisticRegression(penalty='l1', C=2)))]
pipe = Pipeline(estimators)

In [60]:
from sklearn.model_selection import cross_val_score, cross_val_predict
import creonmetrics

In [61]:
scores = cross_val_score(pipe, X_train.values, y_train.values, cv=5, scoring=f1_labeled_scorer, n_jobs=4)

In [62]:
scores

array([ 0.98787879,  0.98484848,  0.98480243,  0.98325723,  0.98787879])

In [63]:
scores.mean(), scores.std()*2

(0.98573314440991511, 0.0036863612019587418)

In [64]:
pipe.fit(X_train.values, y_train.values)

Pipeline(steps=[('scaler', MaxAbsScaler(copy=True)), ('clf', PNUWrapper(base_estimator=LogisticRegression(C=2, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
      num_unlabeled=0.0, random_state=None, threshold_set_pct=None))])

In [65]:
creonmetrics.report_metrics(pipe, X_test.values, y_test.values)

assumed_brier       0.984081
assumed_f1          0.009285
assumed_f1beta10    0.319280
labeled_acc         0.987362
labeled_avg_prec    0.996750
labeled_brier       0.011853
labeled_f1          0.990338
labeled_prec        1.000000
labeled_recall      0.980861
labeled_roc_auc     0.990431
pr_one_unlabeled    0.999783
pu_score            0.964738
dtype: float64

In [66]:
pipe.get_params()

{'clf': PNUWrapper(base_estimator=LogisticRegression(C=2, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
           penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
           verbose=0, warm_start=False),
       num_unlabeled=0.0, random_state=None, threshold_set_pct=None),
 'clf__base_estimator': LogisticRegression(C=2, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
           penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
           verbose=0, warm_start=False),
 'clf__base_estimator__C': 2,
 'clf__base_estimator__class_weight': None,
 'clf__base_estimator__dual': False,
 'clf__base_estimator__fit_intercept': True,
 'clf__base_estimator__intercept_scaling': 1,
 'clf__base_estimator__max_iter': 100,
 'clf__base_estimator__multi_class': 'ovr',
 'clf__base_estimator__n_jobs': 1,
 'clf__base_estima

In [67]:
param_search = {'clf__base_estimator__C': sp.stats.expon(scale=4), 
               'clf__num_unlabeled': sp.stats.randint(low=2000, high=10000)}

In [68]:
random_log_regress = RandomizedSearchCV(pipe, param_search, n_iter=20, scoring=pu_scorer, n_jobs=-1, cv=5)

In [205]:
double_cross_scores_log = cross_val_score(random_log_regress, X_train.values, y_train.values, cv=2, scoring=pu_scorer, n_jobs=-1)

In [206]:
double_cross_scores_log

array([ 0.72420223,  0.97423055])

In [69]:
random_log_regress.fit(X_train.values, y_train.values)

RandomizedSearchCV(cv=5, error_score='raise',
          estimator=Pipeline(steps=[('scaler', MaxAbsScaler(copy=True)), ('clf', PNUWrapper(base_estimator=LogisticRegression(C=2, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
      num_unlabeled=0.0, random_state=None, threshold_set_pct=None))]),
          fit_params={}, iid=True, n_iter=20, n_jobs=-1,
          param_distributions={'clf__base_estimator__C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000000000E27F390>, 'clf__num_unlabeled': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000000000E27FC88>},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score=True, scoring=make_scorer(pu_score),
          verbose=0)

In [70]:
random_log_regress.best_params_

{'clf__base_estimator__C': 6.6552987801776577, 'clf__num_unlabeled': 6014}

In [71]:
random_log_regress.best_score_

4.7995903126465533

In [220]:
random_log_regress.cv_results_

{'mean_fit_time': array([ 6.07199998,  6.84040003,  6.41475997,  6.23632002,  7.05251999,
         6.37104001,  6.62064009,  5.75971999,  6.77391996,  6.72996006,
         6.26087995,  5.98415995,  5.65988002,  6.68032007,  6.53327994,
         6.56148005,  7.20840001,  6.98012004,  6.51079998,  5.53500004]),
 'mean_score_time': array([ 0.74236007,  0.51299996,  0.45552001,  0.46487999,  0.48360004,
         0.47423992,  0.39312   ,  0.51792006,  0.63336   ,  0.62087994,
         0.48156004,  0.55536003,  0.38687997,  0.51791992,  0.65208006,
         0.49607992,  0.46176   ,  0.48379989,  0.50231996,  0.38171997]),
 'mean_test_score': array([ 3.37972592,  4.60649412,  4.2117194 ,  4.38025589,  3.76724907,
         4.60651766,  4.57080159,  4.73064426,  4.37933608,  4.46379385,
         2.71753929,  4.64297667,  3.61092503,  4.59440701,  2.23583535,
         2.49033771,  4.53421867,  4.56838823,  3.81706648,  3.61380793]),
 'mean_train_score': array([ 3.39202669,  5.06057872,  4.213406

In [221]:
best = random_log_regress.best_estimator_

In [222]:
y_predict = best.predict(X_test.values)

In [223]:
y_true = y_test.values

In [231]:
from creonmetrics import labeled_metric, pu_score
from sklearn.metrics import f1_score, brier_score_loss, auc, average_precision_score

In [229]:
(labeled_metric(y_true, y_predict, f1_score),
labeled_metric(y_true, y_predict, brier_score_loss),
labeled_metric(y_true, y_predict, auc, reorder=True),
labeled_metric(y_true, y_predict, average_precision_score))

(0.4473197781885398, 0.47235387045813587, 0.5, 0.87120391720255441)

In [232]:
pu_score(y_true, y_predict)

5.4708833487226842