In [2]:
import pandas as pd
import numpy as np
import scipy as sp
# Raise pandas' display limits so frames up to 400 rows / 400 columns are
# rendered without truncation.
pd.set_option("display.max_rows", 400)
pd.set_option("display.max_columns", 400)
%load_ext autoreload
%autoreload 2

In [3]:
from loadcreon import LoadCreon
from creonmetrics import pu_scorer, prior_squared_error_scorer_015, brier_score_labeled_loss_scorer
from semisuperhelper import SemiSupervisedHelper

In [4]:
# Location of the membership extract on the local Windows drive.
# A raw string is used so every backslash is taken literally: the previous
# literal relied on the invalid escapes "\D" and "\m" being passed through
# (a SyntaxWarning on recent Python versions) and needed a hand-doubled
# backslash to stop "\010" being read as an octal escape.
path = r"C:\Data\010317\membership14_final_0103.txt"
# Build the loader from the file at `path`; its `X` and `y` attributes feed
# the train/test split further down.
lc = LoadCreon(path)

## Let's create a stratified 80/20 train/test holdout split for future use

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
# Hold out 20% of the rows as a test set, stratified on the labels; the fixed
# random_state keeps the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    lc.X,
    lc.y,
    test_size=0.2,
    random_state=771,
    stratify=lc.y,
)

## Let's just train on PN data for now

In [111]:
# Build the semi-supervised helper from the training labels (as a NumPy array);
# it is used below to derive the pn / pn_assume / u subsets of X_train.
semi_helper = SemiSupervisedHelper(y_train.values)

In [112]:
# Extract the "pn" subset of the training data and its labels.
# presumably "pn" means the rows labelled positive or negative — TODO confirm
# against SemiSupervisedHelper.
# NOTE(review): X_train_pn / y_train_pn are not referenced again in the visible
# part of this notebook.
X_train_pn, y_train_pn = semi_helper.pn(X_train.values)

In [128]:
# Build a feature/label pair via pn_assume.
# NOTE(review): unlabeled_pct=345353453 looks like a placeholder / keyboard-mash
# value rather than a meaningful percentage — confirm how pn_assume interprets
# it (e.g. whether it is clamped) and replace it with an intentional value.
X_t, y_t = semi_helper.pn_assume(X_train.values, unlabeled_pct=345353453)

In [129]:
# Sanity check: display the shapes of the arrays returned by pn_assume.
X_t.shape, y_t.shape

((352556, 288), (352556,))

In [130]:
# Peek at the first 20 and the last 20 labels of y_t.
y_t[:20], y_t[-20:]

(array([1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0], dtype=int64),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64))

## Quick Scaler and LASSO to test run-time

In [139]:
from sklearn.preprocessing import MaxAbsScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from pnuwrapper import PNUWrapper

In [170]:
# Two-step pipeline: scale the features with MaxAbsScaler, then fit an
# L1-penalised logistic regression wrapped in PNUWrapper.
#
# The solver is pinned to 'liblinear' explicitly. The original call relied on
# the library default, which was 'liblinear' when this notebook was run (see
# the recorded repr of the fitted pipeline) but is 'lbfgs' in newer
# scikit-learn releases, and 'lbfgs' rejects penalty='l1'. Pinning it keeps
# the results unchanged and the cell runnable across versions.
estimators = [
    ('scaler', MaxAbsScaler()),
    ('clf', PNUWrapper(
        base_estimator=LogisticRegression(penalty='l1', C=10, solver='liblinear'),
        # NOTE(review): 5819 is an unexplained magic number — document where
        # it comes from or lift it into a named constant.
        num_unlabeled=5819,
    )),
]
pipe = Pipeline(estimators)

In [171]:
from sklearn.model_selection import cross_val_score, cross_val_predict
from creonmetrics import pu_scorer

In [172]:
%%time
# 5-fold cross-validation of the pipeline on the training split, scored with
# pu_scorer and spread over 4 worker processes.
scores = cross_val_score(
    pipe,
    X_train,
    y_train,
    scoring=pu_scorer,
    cv=5,
    n_jobs=4,
)

Wall time: 14.2 s


In [173]:
# Display the per-fold scores returned by cross_val_score.
scores

array([ 4.96740673,  5.12783727,  3.78494382,  5.00711597,  4.70779884])

In [174]:
# Summarise the fold scores: mean, and a spread of two standard deviations.
(scores.mean(), 2 * scores.std())

(4.719020525554388, 0.97342736126165508)

In [175]:
# Extract the "u" subset of the training data and its labels.
# presumably "u" means the unlabeled rows — TODO confirm against
# SemiSupervisedHelper.
# NOTE(review): this call passes the X_train DataFrame, whereas the pn() and
# pn_assume() calls above pass X_train.values — confirm the helper accepts both.
X_train_u, y_u = semi_helper.u(X_train)

In [177]:
# Fit the pipeline on the full training split so it can be used for the
# prediction and coefficient inspection cells that follow.
pipe.fit(X_train, y_train)

Pipeline(steps=[('scaler', MaxAbsScaler(copy=True)), ('clf', PNUWrapper(base_estimator=LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
      num_unlabeled=5819))])

In [178]:
# Predict class labels for the "u" subset with the fitted pipeline.
y_predict_u = pipe.predict(X_train_u)

In [179]:
# Count how many rows of the "u" subset were predicted as each class.
pd.Series(y_predict_u).value_counts()

0    335573
1     14451
dtype: int64

In [180]:
# Dump every parameter of the pipeline, including the nested estimator
# parameters (keys take the form 'clf__base_estimator__C', as the output shows).
pipe.get_params()

{'clf': PNUWrapper(base_estimator=LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
           penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
           verbose=0, warm_start=False),
       num_unlabeled=5819),
 'clf__base_estimator': LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
           penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
           verbose=0, warm_start=False),
 'clf__base_estimator__C': 10,
 'clf__base_estimator__class_weight': None,
 'clf__base_estimator__dual': False,
 'clf__base_estimator__fit_intercept': True,
 'clf__base_estimator__intercept_scaling': 1,
 'clf__base_estimator__max_iter': 100,
 'clf__base_estimator__multi_class': 'ovr',
 'clf__base_estimator__n_jobs': 1,
 'clf__base_estimator__penalty': 'l1',
 'clf__base_estima

In [188]:
# Inspect the coefficients of the logistic regression inside the 'clf' step;
# exact zeros in the output are features the L1 penalty dropped.
# NOTE(review): coef_ is read from the `base_estimator` attribute itself, which
# suggests PNUWrapper fits that object in place rather than a clone exposed as
# `base_estimator_` — confirm against PNUWrapper.
pipe.named_steps['clf'].base_estimator.coef_

array([[  1.69322822e+00,   0.00000000e+00,   8.38148066e+01,
          7.06074618e+00,   5.32545252e+01,   6.07643651e+01,
         -7.96455145e+00,  -9.55387802e-01,  -1.00547151e+01,
         -2.26625256e+01,  -1.74198938e+01,  -2.14548174e+01,
          3.19566412e+01,   0.00000000e+00,   0.00000000e+00,
          2.18567414e+00,   1.54837057e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,  -1.83695469e+01,
         -8.65570383e+00,  -1.04783205e+00,   1.40143810e+01,
         -6.36703387e+00,   0.00000000e+00,   0.00000000e+00,
         -8.40208294e+00,   0.00000000e+00,   9.08820465e+00,
         -1.59831502e+01,   3.52190504e+00,   5.52701197e+00,
          4.24825300e-01,   0.00000000e+00,  -6.57788617e+00,
          0.00000000e+00,  -2.04388310e+00,   0.00000000e+00,
         -7.92311300e+00,  -5.66328865e-01,   1.00817461e+01,
          5.01314920e+00,   7.94842146e+00,   0.00000000e+00,
        